def test_any_method_after_close_throws_exception():
    """ Calling any consumer method after close should throw a RuntimeError """
    c = Consumer({'group.id': 'test',
                  'enable.auto.commit': True,
                  'enable.auto.offset.store': False,
                  'socket.timeout.ms': 50,
                  'session.timeout.ms': 100})

    c.subscribe(["test"])
    c.unsubscribe()
    c.close()

    with pytest.raises(RuntimeError) as ex:
        c.subscribe(['test'])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.unsubscribe()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.poll()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.consume()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.assign([TopicPartition('test', 0)])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.unassign()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.assignment()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.commit()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.committed([TopicPartition("test", 0)])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.position([TopicPartition("test", 0)])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.seek([TopicPartition("test", 0, 0)])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        lo, hi = c.get_watermark_offsets(TopicPartition("test", 0))
    assert 'Consumer closed' == str(ex.value)
def analytics_internet3_logs(): consumer = Consumer({'bootstrap.servers': kafka_hosts, 'group.id': 'Internet3_logs_%s' %dt, 'default.topic.config': {'auto.offset.reset': 'latest', 'auto.commit.enable': 'true'}}) consumer.subscribe(['haproxy_logs']) try: while True: msg = consumer.poll() if not msg.error(): Msg = msg.value().decode('utf-8').strip() try: tm = time.strftime('%Y%m%d%H%M', time.localtime()) if Msg: Msg = Msg.split() if len(Msg) >= 17: internet_access_minute = 'internet_access_minute_%s' % tm RC.incr(internet_access_minute) RC.expire(internet_access_minute,3600) except Exception as e: logging.error(e) continue elif msg.error().code() != KafkaError._PARTITION_EOF: logging.error(msg.error()) continue except Exception as e: logging.error(e) finally: consumer.close()
def test_basic_api():
    """ Basic API tests, these won't really do anything since there is no
        broker configured. """

    try:
        kc = Consumer()
    except TypeError as e:
        assert str(e) == "expected configuration dict"

    def dummy_commit_cb(err, partitions):
        pass

    kc = Consumer({'group.id': 'test',
                   'socket.timeout.ms': '100',
                   'session.timeout.ms': 1000,  # Avoid close() blocking too long
                   'on_commit': dummy_commit_cb})

    kc.subscribe(["test"])
    kc.unsubscribe()

    def dummy_assign_revoke(consumer, partitions):
        pass

    kc.subscribe(["test"], on_assign=dummy_assign_revoke, on_revoke=dummy_assign_revoke)
    kc.unsubscribe()

    msg = kc.poll(timeout=0.001)
    if msg is None:
        print('OK: poll() timeout')
    elif msg.error():
        print('OK: consumer error: %s' % msg.error().str())
    else:
        print('OK: consumed message')

    partitions = list(map(lambda p: TopicPartition("test", p), range(0, 100, 3)))
    kc.assign(partitions)
    kc.unassign()

    kc.commit(asynchronous=True)

    try:
        kc.commit(asynchronous=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._NO_OFFSET)

    # Get current position, should all be invalid.
    kc.position(partitions)
    assert len([p for p in partitions if p.offset == -1001]) == len(partitions)

    try:
        offsets = kc.committed(partitions, timeout=0.001)
    except KafkaException as e:
        assert e.args[0].code() == KafkaError._TIMED_OUT

    kc.close()
def consume():
    c = Consumer({'bootstrap.servers': KAFKA_SERVER,
                  'group.id': 'mygroup',
                  'default.topic.config': {'auto.offset.reset': 'smallest'}})
    c.subscribe([KAFKA_TOPIC])
    while True:
        msg = c.poll()
        if not msg.error():
            print('Received message: %s' % msg.value().decode('utf-8'))
    c.close()
def subscribe():
    c = Consumer({'bootstrap.servers': '0',
                  'group.id': 'test-consumer-group',
                  'default.topic.config': {'auto.offset.reset': 'smallest'}})
    c.subscribe(['neuronraindata'])
    while True:
        msg = c.poll()
        if not msg.error() and msg.value():
            # msg.value() returns bytes; decode it for display
            print('Received message: ', msg.value().decode("utf-8"))
        else:
            print(msg.error())
    c.close()
class KafkaWorkflowResultsReceiver(object): _requires = ['confluent-kafka'] def __init__(self, message_converter=ProtobufWorkflowResultsConverter, current_app=None): import walkoff.server.workflowresults # Need this import self.thread_exit = False kafka_config = walkoff.config.Config.WORKFLOW_RESULTS_KAFKA_CONFIG self.receiver = Consumer(kafka_config) self.topic = walkoff.config.Config.WORKFLOW_RESULTS_KAFKA_TOPIC self.message_converter = message_converter self.workflows_executed = 0 if current_app is None: self.current_app = Flask(__name__) self.current_app.config.from_object(walkoff.config.Config) self.current_app.running_context = context.Context(init_all=False) else: self.current_app = current_app def receive_results(self): """Constantly receives data from the Kafka Consumer and handles it accordingly""" logger.info('Starting Kafka workflow results receiver') self.receiver.subscribe(['{}.*'.format(self.topic)]) while not self.thread_exit: raw_message = self.receiver.poll(1.0) if raw_message is None: gevent.sleep(0.1) continue if raw_message.error(): if raw_message.error().code() == KafkaError._PARTITION_EOF: gevent.sleep(0.1) continue else: logger.error('Received an error in Kafka receiver: {}'.format(raw_message.error())) gevent.sleep(0.1) continue with self.current_app.app_context(): self._send_callback(raw_message.value()) self.receiver.close() return def _send_callback(self, message_bytes): event, sender, data = self.message_converter.to_event_callback(message_bytes) if sender is not None and event is not None: with self.current_app.app_context(): event.send(sender, data=data) if event in [WalkoffEvent.WorkflowShutdown, WalkoffEvent.WorkflowAborted]: self._increment_execution_count() def _increment_execution_count(self): self.workflows_executed += 1
def test_on_commit(): """ Verify that on_commit is only called once per commit() (issue #71) """ class CommitState(object): def __init__(self, topic, partition): self.topic = topic self.partition = partition self.once = True def commit_cb(cs, err, ps): print('on_commit: err %s, partitions %s' % (err, ps)) assert cs.once is True assert err == KafkaError._NO_OFFSET assert len(ps) == 1 p = ps[0] assert p.topic == cs.topic assert p.partition == cs.partition cs.once = False cs = CommitState('test', 2) c = Consumer({'group.id': 'x', 'enable.auto.commit': False, 'socket.timeout.ms': 50, 'session.timeout.ms': 100, 'on_commit': lambda err, ps: commit_cb(cs, err, ps)}) c.assign([TopicPartition(cs.topic, cs.partition)]) for i in range(1, 3): c.poll(0.1) if cs.once: # Try commit once try: c.commit(asynchronous=False) except KafkaException as e: print('commit failed with %s (expected)' % e) assert e.args[0].code() == KafkaError._NO_OFFSET c.close()
class KafkaWorkflowCommunicationReceiver(object):
    """Receives communication via Kafka and sends it to the executing workflow"""
    _requires = ['confluent-kafka']

    def __init__(self, message_converter=ProtobufWorkflowCommunicationConverter):
        self._ready = False

        kafka_config = walkoff.config.Config.WORKFLOW_COMMUNICATION_KAFKA_CONFIG
        self.receiver = Consumer(kafka_config)
        self.topic = walkoff.config.Config.WORKFLOW_COMMUNICATION_KAFKA_TOPIC
        self.message_converter = message_converter
        self.exit = False

        if self.check_status():
            self._ready = True

    def shutdown(self):
        self.exit = True
        self.receiver.close()

    def receive_communications(self):
        """Constantly receives data from the Kafka and handles it accordingly"""
        logger.info('Starting workflow communication receiver')
        while not self.exit:
            raw_message = self.receiver.poll(1.0)
            if raw_message is None:
                continue
            if raw_message.error():
                if raw_message.error().code() == KafkaError._PARTITION_EOF:
                    continue
                else:
                    logger.error('Received an error in Kafka receiver: {}'.format(raw_message.error()))
                    continue

            message = self.message_converter.to_received_message(raw_message.value())
            if message is not None:
                yield message
            else:
                break

        # Returning ends the generator; raising StopIteration here would become a
        # RuntimeError under PEP 479 (Python 3.7+).
        return

    def is_ready(self):
        return self._ready

    def check_status(self):
        if self.receiver is not None:
            return True
        return False
def analytics_intranet_logs():
    consumer = Consumer({'bootstrap.servers': kafka_hosts,
                         'group.id': 'Intranet_logs_%s' % dt,
                         'default.topic.config': {'auto.offset.reset': 'latest',
                                                  'auto.commit.enable': 'true'}})
    consumer.subscribe(['haproxy2_logs'])
    try:
        while True:
            msg = consumer.poll()
            if not msg.error():
                Msg = msg.value().decode('utf-8').strip()
                try:
                    tt = time.strftime('%Y%m%d', time.localtime())
                    th = time.strftime('%Y%m%d%H', time.localtime())
                    tm = time.strftime('%Y%m%d%H%M', time.localtime())
                    H_key = 'haproxy2_topic_%s' % tt
                    top2_url_hour = 'top2_url_hour_%s' % th
                    top2_url_minute = 'top2_url_minute_%s' % tm
                    if len(Msg.split()) >= 17:
                        val = Msg.split('{')
                        if len(val) >= 2:
                            Topic = val[1].split('}')[0]
                            Rtime = val[0].split()[8]
                            Rtime = int(Rtime.split('/')[4])
                            if ':' in Topic:
                                Topic = str(Topic.split(':')[0])
                            if '|' in Topic:
                                Topic = str(Topic.split('|')[0])
                            if '.baihe.com' in Topic:
                                Key = 'haproxy2_logs_%s_%s' % (tt, Topic)
                                Rt_Key = 'Rtime2_%s_%s' % (tt, Topic)
                                # endpoint (URL path)
                                PATH = str(Msg.split()[17]).split('?')[0]
                                URL = 'http://%s%s' % (Topic, PATH)
                                RC.zincrby(top2_url_hour, URL, 1)
                                RC.zincrby(top2_url_minute, URL, 1)
                                for KEY in (H_key, Key, Rt_Key, top2_url_hour, top2_url_minute):
                                    RC.expire(KEY, 3600)
                                RC.sadd(H_key, Topic)
                                RC.incr(Key)
                                if Rtime:
                                    RC.lpush(Rt_Key, Rtime)
                except Exception as e:
                    logging.error(e)
                    continue
            elif msg.error().code() != KafkaError._PARTITION_EOF:
                logging.error(msg.error())
                continue
    except Exception as e:
        logging.error(e)
    finally:
        consumer.close()
def analytics_internet_logs():
    consumer = Consumer({'bootstrap.servers': kafka_hosts,
                         'group.id': 'Internet_logs_%s' % dt,
                         'default.topic.config': {'auto.offset.reset': 'latest',
                                                  'auto.commit.enable': 'true'}})
    consumer.subscribe(['haproxy_logs'])
    try:
        while True:
            msg = consumer.poll()
            if not msg.error():
                Msg = msg.value().decode('utf-8').strip()
                try:
                    tt = time.strftime('%Y%m%d', time.localtime())
                    th = time.strftime('%Y%m%d%H', time.localtime())
                    pv_key = 'baihe_pv_%s' % tt
                    if Msg:
                        Msg = Msg.split()
                        RC.incr(pv_key)
                        if len(Msg) >= 17:
                            Topic = str(Msg[14]).split('|')[0].replace('{', '').strip()
                            IP = str(Msg[5])
                            H_key = 'haproxy_topic_%s' % tt
                            top_ip = 'top_ip_%s' % tt
                            top_ip_hour = 'top_ip_%s' % th
                            top_url_hour = 'top_url_%s' % th
                            PATH = str(Msg[16]).split('?')[0]
                            URL = 'http://%s%s' % (Topic, PATH)
                            Ha_Key = 'haproxy_logs_%s_%s' % (tt, Topic)
                            top_ip_domain = 'top_%s_domain_%s' % (IP, tt)
                            top_ip_domain_hour = 'top_%s_domain_%s' % (IP, th)
                            for KEY in (H_key, pv_key, top_ip, top_url_hour, top_ip_hour,
                                        Ha_Key, top_ip_domain, top_ip_domain_hour):
                                RC.expire(KEY, 3600)
                            RC.sadd(H_key, Topic)
                            RC.incr(Ha_Key)
                            # source IP
                            RC.zincrby(top_ip, IP, 1)
                            RC.zincrby(top_ip_hour, IP, 1)
                            # IP + endpoint
                            RC.zincrby(top_ip_domain, URL, 1)
                            RC.zincrby(top_ip_domain_hour, URL, 1)
                            # endpoint
                            RC.zincrby(top_url_hour, URL, 1)
                except Exception:
                    continue
            elif msg.error().code() != KafkaError._PARTITION_EOF:
                logging.error(msg.error())
                continue
    except Exception as e:
        logging.error(e)
    finally:
        consumer.close()
def WAF_logs():
    consumer = Consumer({'bootstrap.servers': kafka_hosts,
                         'group.id': 'Waf_logs_%s' % dt,
                         'default.topic.config': {'auto.offset.reset': 'latest',
                                                  'auto.commit.enable': 'true'}})
    consumer.subscribe(['haproxy_logs'])
    try:
        while True:
            msg = consumer.poll()
            if not msg.error():
                Msg = msg.value().decode('utf-8').strip()
                try:
                    tm = time.strftime('%Y%m%d%H%M', time.localtime())
                    if Msg:
                        Msg = Msg.split()
                        if len(Msg) >= 17:
                            url_code = Msg[9]
                            Topic = str(Msg[14]).split('|')[0].replace('{', '').strip()
                            IP = str(Msg[5])
                            if url_code in ('200', '206', '301', '302', '304', '404'):
                                top_ip_minute = 'top_ip_%s' % tm
                                top_url_minute = 'top_url_%s' % tm
                                PATH = str(Msg[16]).split('?')[0]
                                URL = 'http://%s%s' % (Topic, PATH)
                                top_ip_domain_minute = 'top_%s_domain_%s' % (IP, tm)
                                top_url_ip_minute = 'top_%s_ip_%s' % (URL, tm)
                                # source IP
                                RC.zincrby(top_ip_minute, IP, 1)
                                RC.expire(top_ip_minute, 300)
                                # IP + endpoint
                                RC.zincrby(top_ip_domain_minute, URL, 1)
                                RC.expire(top_ip_domain_minute, 300)
                                # endpoint
                                RC.zincrby(top_url_minute, URL, 1)
                                RC.expire(top_url_minute, 300)
                                # endpoint + IP
                                RC.zincrby(top_url_ip_minute, IP, 1)
                                RC.expire(top_url_ip_minute, 300)
                except Exception as e:
                    logging.error(e)
                    continue
            elif msg.error().code() != KafkaError._PARTITION_EOF:
                logging.error(msg.error())
                continue
    except Exception as e:
        logging.error(e)
    finally:
        consumer.close()
async def consume_events(topic, group, brokers, callback, schema=None,registry=None,delay=0.01,**kwargs): """ Connect to the Kafka endpoint and start consuming messages from the given `topic`. The given callback is applied on each message. """ global consumer if topic in consumers: raise RuntimeError("A consumer already exists for topic: %s" % topic) if (not registry_serializer or not registry_client) and registry: r_client,serializer = create_registry_client(registry) consumer = Consumer({'bootstrap.servers': brokers, 'group.id': group, 'default.topic.config': {'auto.offset.reset': 'largest'}}) consumer.subscribe([topic]) consumers[topic] = consumer try: while True: message = consumer.poll(1) if message: if not message.error(): if registry: message = serializer.decode_message(message.value()) else: message = message.value() await callback(message) consumer.commit() else: await asyncio.sleep(delay) except KafkaException as ex: pass else: consumer.close() finally: consumers.pop(topic, None)
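# A hedged usage sketch for consume_events() above, not taken from the source: the
# topic, group, and broker address are placeholders, and it assumes the module-level
# globals the function relies on (consumers, registry client/serializer) are in scope.
import asyncio

async def log_message(message):
    # Callback applied to every consumed message.
    print("received:", message)

if __name__ == '__main__':
    asyncio.run(consume_events('my-topic', 'my-group', 'localhost:9092', log_message))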
def main(): print("Creating Kafka Terminal Node") parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--name', type=str, required=True) parser.add_argument('--topic_subscriptions', type=str, required=True) parser.add_argument('--output_file', type=str, required=True) parser.add_argument('--broker_port_start', type=int, default=9092, required=False) parser.add_argument('--num_brokers', type=int, default=1, required=False) parsed_args = parser.parse_args() # Get the node's name, the topic name the Terminal # node will write to is the same as its name node_name = parsed_args.name # Get the topics that the Terminal node is subscribed to, remove all whitespaces topic_subscriptions = parsed_args.topic_subscriptions.split(',') # Get the processing function if the node has one processing_function = None if os.path.exists(os.getcwd() + "/sysfiles/{}.dill".format(node_name)): processing_function = dill.load( open("./sysfiles/{}.dill".format(node_name), "rb")) # Get the input file the datasource node will read from output_file = parsed_args.output_file # If there are multiple brokers available, the # Terminal node will connect to them all broker_port_start = parsed_args.broker_port_start num_brokers = parsed_args.num_brokers localhost = "docker.for.mac.localhost" bootstrap_server_str = "{}:{}".format(localhost, broker_port_start) for i in range(1, num_brokers): bootstrap_server_str += ",{}:{}".format(localhost, broker_port_start + i) c = Consumer({ 'bootstrap.servers': bootstrap_server_str, 'group.id': node_name, 'auto.offset.reset': 'earliest' }) c.subscribe(topic_subscriptions) # Read from the subscribed topics, process, and write results out to output file with open(output_file, "a") as file_handle: while True: # Consume an available message msg = c.poll(1.0) if msg is None: continue if msg.error(): print("{} encountered consumer error: {}".format( node_name, msg.error())) continue msg_value = msg.value().decode('utf-8') print('{} received message: {}'.format(node_name, msg_value)) # Process the message and write result to outgoing topic/stream # Flush writes immediately so new messages are visible asap processed_msg = msg_value if processing_function == None else str( processing_function(msg_value)) print("{} processed message: {}".format(node_name, processed_msg)) file_handle.write("{}\n".format(processed_msg)) file_handle.flush()
'bootstrap.servers': '<ccloud bootstrap servers>', 'broker.version.fallback': '0.10.0.0', 'api.version.fallback.ms': 0, 'sasl.mechanisms': 'PLAIN', 'security.protocol': 'SASL_SSL', 'sasl.username': '******', 'sasl.password': '******', 'group.id': str(uuid.uuid1()), # this will create a new consumer group on each invocation. 'auto.offset.reset': 'earliest' }) c.subscribe(['python-test-topic']) try: while True: msg = c.poll(0.1) # Wait for message or event/error if msg is None: # No message available within timeout. # Initial message consumption may take up to `session.timeout.ms` for # the group to rebalance and start consuming. continue if msg.error(): # Errors are typically temporary, print error and continue. print("Consumer error: {}".format(msg.error())) continue print('consumed: {}'.format(msg.value())) except KeyboardInterrupt: pass
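# The fragment above begins inside the configuration dict passed to Consumer(). A
# minimal sketch of the assumed surrounding setup is shown below; the bootstrap
# servers, credentials, and close() placement are placeholders, not values from the source.
import uuid
from confluent_kafka import Consumer

c = Consumer({
    'bootstrap.servers': '<ccloud bootstrap servers>',
    'sasl.mechanisms': 'PLAIN',
    'security.protocol': 'SASL_SSL',
    'sasl.username': '<api key>',
    'sasl.password': '<api secret>',
    'group.id': str(uuid.uuid1()),   # new consumer group on each invocation
    'auto.offset.reset': 'earliest',
})
c.subscribe(['python-test-topic'])
# ... poll loop as above ...
c.close()   # leave the group and commit final offsets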
class StreamAbsGen(object): def __init__(self,data_storage,data_source): #For Apache Cassandra, HBase and Hive, code from HivePythonClient.py for HiveServer2, #HBasePythonClient.py and CassandraPythonClient.py has been #replicated in __iter__(). #Possible storages: #self.data_storage="file" #self.data_storage="hive" #self.data_storage="hbase" #self.data_storage="cassandra" #self.data_storage="USBWWAN_stream" #self.data_storage="KingCobra" #self.data_storage="Spark_Parquet" #self.data_storage="AsFer_Encoded_Strings" self.data_storage=data_storage #Possible datasources: #self.data_source="RZF" #self.data_source="movielens" #self.data_source="USBWWAN" #self.data_source="file" #self.data_source="KingCobra" #self.data_source="Spark_Streaming" #self.data_source="NeuronRain" self.data_source=data_source if self.data_storage=="KingCobra": self.inputfile=open("/var/log/kingcobra/REQUEST_REPLY.queue") if self.data_storage=="AsFer_Encoded_Strings": self.inputfile=open("../cpp-src/asfer.enterprise.encstr") if self.data_storage=="file": self.inputfile=open(data_source,"r") if self.data_storage=="USBWWAN_stream": self.inputfile=open("../../usb-md-github-code/usb_wwan_modified/testlogs/kern.log.print_buffer_byte") if self.data_storage=="hbase": self.hbase_connection = happybase.Connection(host='localhost',port=9090,transport='buffered') self.hbase_table = self.hbase_connection.table('stream_data') print "StreamAbsGen:__init__():connected to HBase table" if self.data_storage=="hive": #pyhs2 client - requires SASL self.hive_conn=pyhs2.connect(host='localhost', port=10000, authMechanism="PLAIN", user='******', password='******', database='default') self.hive_cur=self.hive_conn.cursor() #Show databases print self.hive_cur.getDatabases() #Execute query self.hive_cur.execute("CREATE TABLE stream_data (alphanum STRING)") self.hive_cur.execute("select * from stream_data") #Return column info from query print self.hive_cur.getSchema() print "StreamAbsGen:__init__():connected to Hive table" if self.data_storage=="cassandra": self.cl=Cluster() self.session = self.cl.connect('cassandrakeyspace') inputf=open('movielens_stream2.data') for line in inputf: linetoks=line.split(' ') query='INSERT INTO stream_data(row_id,alphanum) VALUES (\''+linetoks[0]+'\',\''+linetoks[1]+'\');' print query session.execute(query) self.query='SELECT * FROM stream_data' self.resultrows=self.session.execute(self.query) print "StreamAbsGen:__init__(): connected to Cassandra" if self.data_storage=="Kafka": self.c = Consumer({'bootstrap.servers': '0', 'group.id': 'test-consumer-group', 'default.topic.config': {'auto.offset.reset': 'smallest'}}) self.c.subscribe(['neuronraindata']) if self.data_storage=="Socket_Streaming": self.streaming_host=self.data_source self.streaming_port=64001 if self.data_storage=="OperatingSystem": self.streaming_host="localhost" if self.data_storage=="TextHistogramPartition": self.partition_stream=[] for ds in data_source: self.partition_stream.append(open(ds,"r")) if self.data_storage=="DictionaryHistogramPartition": self.partition_stream=open(data_source,"r") def __iter__(self): if self.data_storage=="Spark_Parquet": self.spark=SparkSession.builder.getOrCreate() spark_stream_parquet=self.spark.read.parquet("../java-src/bigdata_analytics/spark_streaming/word.parquet") #spark_stream_parquet_DS=spark_stream_parquet.rdd.map(lambda row: (row.word)) spark_stream_parquet_DS=spark_stream_parquet.rdd.filter(lambda row: row.word not in [' 
','or','and','who','he','she','whom','well','is','was','were','are','there','where','when','may', 'The', 'the', 'In','in','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z',' ','.', '"', ',', '{', '}', '+', '-', '*', '/', '%', '&', '(', ')', '[', ']', '=', '@', '#', ':', '|', ';','\'s','1','2','3','4','5','6','7','8','9','0']) for r in spark_stream_parquet_DS.collect(): print "StreamiAbsGen(Spark Parquet): iterator yielding %s" % r.word.encode("UTF-8") yield r.word.encode("UTF-8") if self.data_storage=="KingCobra": for i in self.inputfile: print "StreamAbsGen(file storage): iterator yielding %s" % i yield i if self.data_storage=="hbase": for key,value in self.hbase_table.scan(): print "StreamAbsGen(HBase storage): iterator yielding %s" % i yield value['cf:alphanum'] if self.data_storage=="AsFer_Encoded_Strings": for i in self.inputfile: print "StreamAbsGen(file storage): iterator yielding %s" % i yield i if self.data_storage=="file": for i in self.inputfile: words=i.split() for word in words: print "StreamAbsGen(file storage): iterator yielding %s" % word.strip() yield word.strip() if self.data_storage=="hive": #Fetch table results for i in self.hive_cur.fetch(): print "StreamAbsGen(Hive storage): iterator yielding %s" % i[0] yield i[0] if self.data_storage=="cassandra": for row in self.resultrows: #print row.row_id,' ',row.alphanum print "StreamAbsGen(Cassandra storage): iterator yielding %s" % row.alphanum yield row.alphanum if self.data_storage=="USBWWAN_stream": for i in self.inputfile: #print "StreamAbsGen(USBWWAN byte stream data): iterator yielding %s" % i yield i if self.data_storage=="Kafka": while True: print "Polling Kafka topic to receive message ..." msg = self.c.poll() if not msg.error() and msg.value(): print('Received message: ' , msg.value().encode("utf-8")) yield msg else: print(msg.error()) self.c.close() if self.data_storage=="Socket_Streaming": s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.connect((self.streaming_host,self.streaming_port)) print "socket_streaming_client(): host = ",self.streaming_host,"; post=",self.streaming_port data="" while data != None: data=s.recv(100) yield data if self.data_storage=="OperatingSystem" and self.data_source=="SchedulerRunQueue": from DeepLearning_SchedulerAnalytics import sched_debug_runqueue while True: schedrunqueue=sched_debug_runqueue() #df=DataFrame(data=schedrunqueue) #yield df yield schedrunqueue if self.data_storage=="TextHistogramPartition": self.sc = SparkContext() for ps in self.partition_stream: partition_stream_DS=self.sc.parallelize(ps.readlines()).flatMap(lambda line: line.split(" ")).map(lambda word: (word,[1])).reduceByKey(lambda v1,v2: v1+v2).groupByKey().mapValues(list) partition=partition_stream_DS.collect() print "partition:",partition if partition[0] is not '': print "StreamAbsGen(Spark Parquet): iterator yielding labelled partition: %s" % partition yield partition if self.data_storage=="DictionaryHistogramPartition": dict_stream=ast.literal_eval(self.partition_stream.read()) for d in dict_stream: yield d
sc = SlackClient(token) # Set 'auto.offset.reset': 'smallest' if you want to consume all messages # from the beginning of the topic settings = { 'bootstrap.servers': 'localhost:9092', 'group.id': 'python_kafka_notify.py', 'default.topic.config': {'auto.offset.reset': 'largest'} } c = Consumer(settings) c.subscribe(['UNHAPPY_PLATINUM_CUSTOMERS']) try: while True: msg = c.poll(0.1) time.sleep(5) if msg is None: continue elif not msg.error(): print('Received message: {0}'.format(msg.value())) if msg.value() is None: continue try: app_msg = json.loads(msg.value().decode()) except: app_msg = json.loads(msg.value()) try: email=app_msg['EMAIL'] message=app_msg['MESSAGE'] channel='unhappy-customers'
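# The handler above is truncated after pulling `email`, `message`, and `channel` out of
# the consumed record. A hedged, self-contained sketch of how the notification might
# then be posted with the slackclient 1.x api_call interface already used above
# (message wording is illustrative only):
def notify_slack(sc, channel, email, message):
    text = '`{}` reported: {}'.format(email, message)
    return sc.api_call('chat.postMessage', channel=channel, text=text)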
class KafkaConsumer: """Defines the base kafka consumer class""" def __init__( self, topic_name_pattern, message_handler, is_avro=True, offset_earliest=False, sleep_secs=1.0, consume_timeout=0.1, ): """Creates a consumer object for asynchronous use""" self.topic_name_pattern = topic_name_pattern self.message_handler = message_handler self.sleep_secs = sleep_secs self.consume_timeout = consume_timeout self.offset_earliest = offset_earliest # # # DONE: Configure the broker properties below. Make sure to reference the project README # and use the Host URL for Kafka and Schema Registry! # # self.broker_properties = { "bootstrap.servers": "PLAINTEXT://localhost:9092", "group.id": "opt-group" # # DONE # } # DONE: Create the Consumer, using the appropriate type. if is_avro is True: self.broker_properties[ "schema.registry.url"] = "http://localhost:8081" self.consumer = AvroConsumer(self.broker_properties) else: self.consumer = Consumer(self.broker_properties) # # # DONE: Configure the AvroConsumer and subscribe to the topics. Make sure to think about # how the `on_assign` callback should be invoked. # # self.consumer.subscribe([topic_name_pattern], on_assign=self.on_assign) def on_assign(self, consumer, partitions): """Callback for when topic assignment takes place""" # DONE: If the topic is configured to use `offset_earliest` set the partition offset to # the beginning or earliest logger.info("on_assign...") for partition in partitions: try: if self.offset_earliest: partition.offset = confluent_kafka.OFFSET_BEGINNING except: logger.info("something wrong with OFFSET_BEGINNING...") # # # DONE # # logger.info("partitions assigned for %s", self.topic_name_pattern) consumer.assign(partitions) async def consume(self): """Asynchronously consumes data from kafka topic""" while True: num_results = 1 while num_results > 0: num_results = self._consume() await gen.sleep(self.sleep_secs) def _consume(self): """Polls for a message. Returns 1 if a message was received, 0 otherwise""" # # # DONE: Poll Kafka for messages. Make sure to handle any errors or exceptions. # Additionally, make sure you return 1 when a message is processed, and 0 when no message # is retrieved. # # logger.info("consume message...") message = self.consumer.poll(1.0) if message is None: logger.info("no message") return 0 elif message.error() is not None: logger.info(f"error from consumer {message.error()}") return 0 elif message.value() is None: logger.info("empty message") return 0 else: logger.info(f"consumed message {message.key()}: {message.value()}") self.message_handler(message) return 1 def close(self): """Cleans up any open kafka consumers""" # # # DONE: Cleanup the kafka consumer # # self.consumer.close()
with open('config.yml', 'r') as file:
    config = yaml.safe_load(file.read())

settings = {'client.id': 'kafka-python-console-sample-consumer',
            'group.id': 'kafka-python-console-sample-group',
            'bootstrap.servers': ','.join(config['kafka']['brokers']),
            'security.protocol': 'SASL_SSL',
            'ssl.ca.location': '/etc/pki/tls/certs/ca-bundle.crt',
            'sasl.mechanisms': 'PLAIN',
            'sasl.username': config['kafka']['credentials']['username'],
            'sasl.password': config['kafka']['credentials']['password'],
            'api.version.request': True,
            'enable.auto.commit': True,
            'broker.version.fallback': '0.10.2.1',
            'log.connection.close': False}
print(settings)

consumer = Consumer(settings)
consumer.subscribe(config['kafka']['topics'])

while True:
    msg = consumer.poll(timeout=10.0)
    if msg is None:
        time.sleep(0.1)
    elif msg.error():
        time.sleep(0.1)
    else:
        print(msg.value())

consumer.unsubscribe()
class KafkaConsumer:
    """Defines the base kafka consumer class"""

    def __init__(
        self,
        topic_name_pattern,
        message_handler,
        is_avro=True,
        offset_earliest=False,
        sleep_secs=1.0,
        consume_timeout=0.1,
    ):
        """Creates a consumer object for asynchronous use"""
        self.topic_name_pattern = topic_name_pattern
        self.message_handler = message_handler
        self.sleep_secs = sleep_secs
        self.consume_timeout = consume_timeout
        self.offset_earliest = offset_earliest

        self.broker_properties = {
            "bootstrap.servers": BROKER_URL,
            "group.id": "server_consumer",
            "auto.offset.reset": "earliest",
        }

        if is_avro is True:
            self.broker_properties["schema.registry.url"] = SCHEMA_REGISTRY_URL
            self.consumer = AvroConsumer(self.broker_properties)
        else:
            self.consumer = Consumer(self.broker_properties)

        self.consumer.subscribe(
            [self.topic_name_pattern],
            on_assign=self.on_assign,
        )

    def on_assign(self, consumer, partitions):
        """Callback for when topic assignment takes place"""
        for partition in partitions:
            if self.offset_earliest is True:
                partition.offset = confluent_kafka.OFFSET_BEGINNING

        logger.info(f"partitions assigned for {self.topic_name_pattern}")
        consumer.assign(partitions)

    async def consume(self):
        """Asynchronously consumes data from kafka topic"""
        while True:
            num_results = 1
            while num_results > 0:
                num_results = self._consume()
            await gen.sleep(self.sleep_secs)

    def _consume(self):
        """
        Polls for a message. Returns 1 if a message was received, 0 otherwise
        """
        try:
            message = self.consumer.poll(timeout=self.consume_timeout)
        except SerializerError as e:
            logger.error(f"Error while consuming data: {e}")
            return 0

        if not message:
            logger.info("No messages received yet.")
            return 0
        elif message.error() is not None:
            logger.error(f"Error encountered: {message.error()}")
            return 0
        else:
            logger.info(f"Message consumed: {message}")
            self.message_handler(message)
            return 1

    def close(self):
        """Cleans up any open kafka consumers"""
        self.consumer.close()
def run_commit_log_consumer( cluster_name, consumer_group, commit_log_topic, partition_state_manager, synchronize_commit_group, start_event, stop_request_event, ): start_event.set() logging.debug("Starting commit log consumer...") positions = {} # NOTE: The commit log consumer group should not be persisted into the # ``__consumer_offsets`` topic since no offsets are committed by this # consumer. The group membership metadata messages will be published # initially but as long as this group remains a single consumer it will # be deleted after the consumer is closed. # It is very important to note that the ``group.id`` **MUST** be unique to # this consumer process!!! This ensures that it is able to consume from all # partitions of the commit log topic and get a comprehensive view of the # state of the consumer groups it is tracking. consumer_config = kafka_config.get_kafka_consumer_cluster_options( cluster_name, override_params={ "group.id": consumer_group, "enable.auto.commit": "false", "enable.auto.offset.store": "true", "enable.partition.eof": "false", "default.topic.config": {"auto.offset.reset": "error"}, }, ) consumer = Consumer(consumer_config) def rewind_partitions_on_assignment(consumer, assignment): # The commit log consumer must start consuming from the beginning of # the commit log topic to ensure that it has a comprehensive view of # all active partitions. consumer.assign( [ TopicPartition( i.topic, i.partition, positions.get((i.topic, i.partition), OFFSET_BEGINNING) ) for i in assignment ] ) consumer.subscribe([commit_log_topic], on_assign=rewind_partitions_on_assignment) while not stop_request_event.is_set(): message = consumer.poll(1) if message is None: continue error = message.error() if error is not None: raise Exception(error) positions[(message.topic(), message.partition())] = message.offset() + 1 group, topic, partition, offset = get_commit_data(message) if group != synchronize_commit_group: logger.debug("Received consumer offsets update from %r, ignoring...", group) continue if offset in LOGICAL_OFFSETS: logger.debug( "Skipping invalid logical offset (%r) from %s/%s...", offset, topic, partition ) continue elif offset < 0: logger.warning( "Received unexpected negative offset (%r) from %s/%s!", offset, topic, partition ) partition_state_manager.set_remote_offset(topic, partition, offset)
class SynchronizedConsumer(object): """ This class implements the framework for a consumer that is intended to only consume messages that have already been consumed and committed by members of another consumer group. This works similarly to the Kafka built-in ``__consumer_offsets`` topic. The consumer group that is being "followed" (the one that must make progress for our consumer here to make progress, identified by the ``synchronize_commit_group`` constructor parameter/instance attribute) must report its offsets to a topic (identified by the ``commit_log_topic`` constructor parameter/instance attribute). This consumer subscribes to both commit log topic, as well as the topic(s) that we are actually interested in consuming messages from. The messages received from the commit log topic control whether or not consumption from partitions belonging to the main topic is paused, resumed, or allowed to continue in its current state without changes. The furthest point in any partition that this consumer should ever consume to is the maximum offset that has been recorded to the commit log topic for that partition. If the offsets recorded to that topic move non-monotonically (due to an intentional offset rollback, for instance) this consumer *may* consume up to the highest watermark point. (The implementation here tries to pause consuming from the partition as soon as possible, but this makes no explicit guarantees about that behavior.) """ initial_offset_reset_strategies = {"earliest": get_earliest_offset, "latest": get_latest_offset} def __init__( self, cluster_name, consumer_group, commit_log_topic, synchronize_commit_group, initial_offset_reset="latest", on_commit=None, ): self.cluster_name = cluster_name self.consumer_group = consumer_group self.commit_log_topic = commit_log_topic self.synchronize_commit_group = synchronize_commit_group self.initial_offset_reset = self.initial_offset_reset_strategies[initial_offset_reset] self.__partition_state_manager = SynchronizedPartitionStateManager( self.__on_partition_state_change ) ( self.__commit_log_consumer, self.__commit_log_consumer_stop_request, ) = self.__start_commit_log_consumer() self.__positions = {} def commit_callback(error, partitions): if on_commit is not None: return on_commit(error, partitions) consumer_configuration = kafka_config.get_kafka_consumer_cluster_options( cluster_name, override_params={ "group.id": self.consumer_group, "enable.auto.commit": "false", "enable.auto.offset.store": "true", "enable.partition.eof": "false", "default.topic.config": {"auto.offset.reset": "error"}, "on_commit": commit_callback, }, ) self.__consumer = Consumer(consumer_configuration) def __start_commit_log_consumer(self, timeout=None): """ Starts running the commit log consumer. 
""" stop_request_event = threading.Event() start_event = threading.Event() result = execute( functools.partial( run_commit_log_consumer, cluster_name=self.cluster_name, consumer_group="{}:sync:{}".format(self.consumer_group, uuid.uuid1().hex), commit_log_topic=self.commit_log_topic, synchronize_commit_group=self.synchronize_commit_group, partition_state_manager=self.__partition_state_manager, start_event=start_event, stop_request_event=stop_request_event, ) ) start_event.wait(timeout) return result, stop_request_event def __check_commit_log_consumer_running(self): if not self.__commit_log_consumer.running(): try: result = self.__commit_log_consumer.result(timeout=0) # noqa except TimeoutError: pass # not helpful raise Exception("Commit log consumer unexpectedly exit!") def __on_partition_state_change( self, topic, partition, previous_state_and_offsets, current_state_and_offsets ): """ Callback that is invoked when a partition state changes. """ logger.debug( "State change for %r: %r to %r", (topic, partition), previous_state_and_offsets, current_state_and_offsets, ) current_state, current_offsets = current_state_and_offsets if current_offsets.local is None: # It only makes sense to manipulate the consumer if we've got an # assignment. (This block should only be entered at startup if the # remote offsets are retrieved from the commit log before the local # consumer has received its assignment.) return # TODO: This will be called from the commit log consumer thread, so need # to verify that calling the ``consumer.{pause,resume}`` methods is # thread safe! if current_state in ( SynchronizedPartitionState.UNKNOWN, SynchronizedPartitionState.SYNCHRONIZED, SynchronizedPartitionState.REMOTE_BEHIND, ): self.__consumer.pause([TopicPartition(topic, partition, current_offsets.local)]) elif current_state is SynchronizedPartitionState.LOCAL_BEHIND: self.__consumer.resume([TopicPartition(topic, partition, current_offsets.local)]) else: raise NotImplementedError("Unexpected partition state: %s" % (current_state,)) def subscribe(self, topics, on_assign=None, on_revoke=None): """ Subscribe to a topic. """ self.__check_commit_log_consumer_running() def assignment_callback(consumer, assignment): # Since ``auto.offset.reset`` is set to ``error`` to force human # interaction on an offset reset, we have to explicitly specify the # starting offset if no offset has been committed for this topic during # the ``__consumer_offsets`` topic retention period. assignment = { (i.topic, i.partition): self.__positions.get((i.topic, i.partition)) for i in assignment } for i in self.__consumer.committed( [ TopicPartition(topic, partition) for (topic, partition), offset in assignment.items() if offset is None ] ): k = (i.topic, i.partition) if i.offset > -1: assignment[k] = i.offset else: assignment[k] = self.initial_offset_reset(consumer, i.topic, i.partition) self.__consumer.assign( [ TopicPartition(topic, partition, offset) for (topic, partition), offset in assignment.items() ] ) for (topic, partition), offset in assignment.items(): # Setting the local offsets will either cause the partition to be # paused (if the remote offset is unknown or the local offset is # not trailing the remote offset) or resumed. 
self.__partition_state_manager.set_local_offset(topic, partition, offset) self.__positions[(topic, partition)] = offset if on_assign is not None: on_assign( self, [TopicPartition(topic, partition) for topic, partition in assignment.keys()], ) def revocation_callback(consumer, assignment): for item in assignment: # TODO: This should probably also be removed from the state manager. self.__positions.pop((item.topic, item.partition)) if on_revoke is not None: on_revoke(self, assignment) self.__consumer.subscribe( topics, on_assign=assignment_callback, on_revoke=revocation_callback ) def poll(self, timeout): self.__check_commit_log_consumer_running() message = self.__consumer.poll(timeout) if message is None: return if message.error() is not None: return message self.__partition_state_manager.validate_local_message( message.topic(), message.partition(), message.offset() ) self.__partition_state_manager.set_local_offset( message.topic(), message.partition(), message.offset() + 1 ) self.__positions[(message.topic(), message.partition())] = message.offset() + 1 return message def commit(self, *args, **kwargs): self.__check_commit_log_consumer_running() return self.__consumer.commit(*args, **kwargs) def close(self): self.__check_commit_log_consumer_running() self.__commit_log_consumer_stop_request.set() try: self.__consumer.close() finally: self.__commit_log_consumer.result()
class KafkaConsumer: """ Defines the base kafka consumer class """ def __init__(self, topic_name_pattern, message_handler, is_avro=True, offset_earliest=False, sleep_secs=1.0, consume_timeout=0.1): """ Creates a consumer object for asynchronous use """ self.topic_name_pattern = topic_name_pattern self.message_handler = message_handler self.sleep_secs = sleep_secs self.consume_timeout = consume_timeout self.offset_earliest = offset_earliest # configure broker properties for consumer self.broker_properties = { "bootstrap.servers": KAFKA_BROKER_URL, "group.id": topic_name_pattern, "auto.offset.reset": "earliest" } # Create the Consumer, using the appropriate type. if is_avro is True: self.broker_properties["schema.registry.url"] = "http://localhost:8081" self.consumer = AvroConsumer(self.broker_properties) else: self.consumer = Consumer(self.broker_properties) # Configure the AvroConsumer and subscribe to the topics. Make sure to think about # how the `on_assign` callback should be invoked. self.consumer.subscribe( [self.topic_name_pattern], # always a list on_assign=self.on_assign ) def on_assign(self, consumer, partitions): """ Callback for when topic assignment takes place """ # If the topic is configured to use `offset_earliest` set the partition offset to # the beginning or earliest for partition in partitions: if self.offset_earliest == True: partition.offset = OFFSET_BEGINNING logger.info(f"partitions assigned for {self.topic_name_pattern}") consumer.assign(partitions) async def consume(self): """ Asynchronously consumes data from kafka topic """ while True: num_results = 1 while num_results > 0: num_results = self._consume() await gen.sleep(self.sleep_secs) def _consume(self): """Polls for a message. Returns 1 if a message was received, 0 otherwise""" try: message = self.consumer.poll(timeout=self.consume_timeout) except: logger.error(f"Message poll failed for: {self.topic_name_pattern}") return 0 if message is None: logger.info("No message received by consumer") return 0 elif message.error() is not None: logger.error(f"Error from consumer: {message.error()}") return 0 self.message_handler(message) logger.info( f"Consumed message - {message.key()}: {message.value()}") return 1 def close(self): """ Cleans up any open kafka consumers """ logger.info(f"closing consumer for {self.topic_name_pattern}") self.consumer.close()
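# A hedged usage sketch (not from the source) for the KafkaConsumer wrapper above: the
# topic pattern and handler are hypothetical, and the Tornado IOLoop wiring is assumed
# from the tornado-style `gen.sleep` call inside consume().
from tornado import ioloop

def print_message(message):
    print(f"{message.topic()}: {message.value()}")

consumer = KafkaConsumer(
    topic_name_pattern="^org.example.purchases.*",
    message_handler=print_message,
    is_avro=False,
    offset_earliest=True,
)

try:
    # run_sync drives the (never-ending) consume() coroutine until interrupted
    ioloop.IOLoop.current().run_sync(consumer.consume)
except KeyboardInterrupt:
    pass
finally:
    consumer.close()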
from confluent_kafka import Consumer if __name__ == '__main__': config = { 'bootstrap.servers': '127.0.0.1:9092', 'group.id': '123', 'default.topic.config': { 'auto.offset.reset': 'earliest' }, 'enable.auto.commit': True } def print_assignment(consumer, partitions): print('Assignment: {}'.format(partitions)) kafka_consumer = Consumer(config) kafka_consumer.subscribe(['PiEstimationMontecarlo-65f6529bcba1'], on_assign=print_assignment) print('consuming') while True: msg = kafka_consumer.poll(timeout=1.0) if msg is None: print('null message') else: print(msg.value())
kafka_consumer = Consumer({ 'bootstrap.servers': "kafka:9092", 'group.id': 'python-consumer', 'default.topic.config': { 'auto.offset.reset': 'smallest' } }) kafka_producer = Producer({ 'bootstrap.servers': "kafka:9092", }) kafka_consumer.subscribe(['snowplow_enriched_good']) while True: msg = kafka_consumer.poll(1.0) if msg is None: continue if msg.error(): if msg.error().code() == KafkaError._PARTITION_EOF: continue else: print(msg.error()) break event = msg.value().decode('utf-8') try: json_data = snowplow_analytics_sdk.event_transformer.transform(event) kafka_producer.poll(0) kafka_producer.produce('snowplow_json_event',
class KafkaConsumerWorker(BaseWorker): topic_name = None consumer_name = None consumer_settings = {} commit_on_complete = True async_commit = True poll_timeout = 0.01 sleep_time = 0.05 timestamp_fields = ['timestamp'] decimal_fields = [] boolean_fields = [] def setup(self): self.consumer = Consumer(**self.get_consumer_settings()) self.serializer = self.get_message_serializer() self.set_topic() def teardown(self): self.consumer.close() def get_topic_name(self): if self.topic_name is None: raise NotImplementedError return self.topic_name def get_consumer_name(self): if self.consumer_name is None: raise NotImplementedError return self.consumer_name def get_broker_url(self): broker_url = settings.BROKER_URL if broker_url is None: raise NotImplementedError return broker_url def get_zookeeper_url(self): zookeeper_url = settings.ZOOKEEPER_URL if zookeeper_url is None: raise NotImplementedError return zookeeper_url def get_consumer_settings(self): broker_url = self.get_broker_url() logger.debug('connecting to kafka: ' + broker_url) consumer_name = self.get_consumer_name() logger.debug('using group id: ' + consumer_name) initial_settings = { 'api.version.request': True, 'broker.version.fallback': '0.9.0', 'client.id': 'JanglConsumer', 'bootstrap.servers': broker_url, 'group.id': consumer_name, 'default.topic.config': {'auto.offset.reset': 'earliest'}, 'enable.auto.commit': False, 'on_commit': self.on_commit, 'session.timeout.ms': 10000, 'heartbeat.interval.ms': 1000, } return generate_client_settings(initial_settings, self.consumer_settings) def get_message_serializer(self): schema_registry_url = self.get_schema_registry_url() logger.debug('loading schema registry: ' + schema_registry_url) schema_client = CachedSchemaRegistryClient(url=schema_registry_url) return MessageSerializer(schema_client) def get_schema_registry_url(self): schema_microservice = settings.SCHEMA_MICROSERVICE if schema_microservice: schema_registry_url = get_service_url(schema_microservice) else: schema_registry_url = settings.SCHEMA_REGISTRY_URL if schema_registry_url is None: raise NotImplementedError return schema_registry_url def set_topic(self): topic_name = self.get_topic_name() logger.debug('set kafka topic: ' + topic_name) self.consumer.subscribe([topic_name], on_assign=self.on_assign, on_revoke=self.on_revoke) def on_assign(self, consumer, partitions): logger.debug('partitions assigned: {}'.format(partitions)) consumer.assign(partitions) def on_revoke(self, consumer, partitions): logger.debug('partitions revoked: {}'.format(partitions)) try: consumer.commit(async=False) except KafkaException: pass consumer.unassign() def on_commit(self, err, partitions): if err is None: logger.debug('commit done: {}'.format(partitions)) else: logger.error('commit error: {} - {}'.format(err, partitions)) def handle(self): message = self.consumer.poll(timeout=self.poll_timeout) if message is not None: if message.error(): if message.error().code() == KafkaError._PARTITION_EOF: # End of partition event logger.info('%% %s [%d] reached end at offset %d\n' % (message.topic(), message.partition(), message.offset())) elif message.error(): raise KafkaException(message.error()) else: message = DecodedMessage(self.serializer, message) message = self.parse_message(message) self.consume_message(message) if self.commit_on_complete: self.commit() self.done() else: self.wait() def parse_message(self, message): for field in self.timestamp_fields: if field in message: try: message[field] = datetime.fromtimestamp(message[field], utc) except ValueError: 
                    try:
                        message[field] = datetime.fromtimestamp(message[field] / 1000, utc)
                    except TypeError:
                        pass
                except TypeError:
                    pass
        for field in self.decimal_fields:
            if field in message:
                try:
                    message[field] = decimal.Decimal(message[field])
                except (TypeError, decimal.InvalidOperation):
                    pass
        for field in self.boolean_fields:
            if field in message:
                try:
                    message[field] = bool(message[field])
                except TypeError:
                    pass
        return message

    def commit(self):
        if not self.consumer_settings.get('enable.auto.commit'):
            self.consumer.commit(asynchronous=self.async_commit)

    def consume_message(self, message):
        pass
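# A hedged sketch of a concrete worker built on the KafkaConsumerWorker base class
# above; the topic, consumer name, and field lists are hypothetical, and
# consume_message() simply logs the already-decoded message dict.
class LeadEventWorker(KafkaConsumerWorker):
    topic_name = 'lead-events'           # hypothetical topic
    consumer_name = 'lead-event-worker'  # used as the consumer group.id
    timestamp_fields = ['timestamp', 'created_at']
    decimal_fields = ['price']

    def consume_message(self, message):
        logger.info('got lead event: {}'.format(message))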
class AnomalyDetectorPipeline(object): def __init__(self): self.kafka_consumer = Consumer({ 'bootstrap.servers': KAFKA_SERVERS, 'group.id': KAFKA_MESSAGE_CONSUMER_GROUP, 'session.timeout.ms': 6000, 'auto.offset.reset': 'latest' }) self.kafka_consumer.subscribe([KAFKA_MESSAGE_TOPIC]) LOG.info('Connected with Kafka Consumer') self.kafka_producer = Producer({'bootstrap.servers': KAFKA_SERVERS}) LOG.info('Connected with Kafka Producer') hosts_sentinel = [] endpoint = DBAAS_SENTINEL_ENDPOINT_SIMPLE.replace("sentinel://", "") for host in endpoint.split(','): hosts_sentinel.append(tuple(host.split(':'))) sentinel = Sentinel(hosts_sentinel, socket_timeout=REDIS_SOCKET_TIMEOUT) self.rediscon = sentinel.master_for( DBAAS_SENTINEL_SERVICE_NAME, socket_timeout=REDIS_SOCKET_TIMEOUT, password=DBAAS_SENTINEL_PASSWORD) LOG.info('Connected with Redis Database') def main(self): msg_consumed_count = 0 LOG.info('Message Pipeline Start') while True: message = self.kafka_consumer.poll(1.0) if message is None: continue if message.error(): error = "Kafka consumer error: {}".format(message.error()) LOG.error(error) continue metric = self.message2metric(message.value().decode('utf-8')) if metric['metric'] == 'cpu': self.handle_cpu_metric(metric) elif metric['metric'] == 'disk': self.handle_disk_metric(metric) msg_consumed_count += 1 if msg_consumed_count == LOG_COUNT: self.kafka_producer.flush() msg = "{} messages read. Last metric collected at: {}".format( msg_consumed_count, metric['time_collected']) LOG.info(msg) msg_consumed_count = 0 def send_result(self, anomaly_result): output = "anomaly_score,host={},app={}".format(anomaly_result['host'], anomaly_result['app']) output += ",{}={},{}={},{}={},{}={}".format( 'time_collected', anomaly_result['time_collected'].replace(' ', '\ '), 'time_pipeline', anomaly_result['time_pipeline'].replace(' ', '\ '), 'time_detector', anomaly_result['time_detector'].replace(' ', '\ '), 'period_description', anomaly_result['period_description'].replace(' ', '\ ')) output += " value={},anomaly_score={},is_anomaly={} {}\n".format( anomaly_result['value'], anomaly_result['anomaly_score'], anomaly_result['is_anomaly'], anomaly_result['ts'], ) self.kafka_producer.poll(0) self.kafka_producer.produce(KAFKA_ANOMALY_TOPIC, output.encode('utf-8')) def message2metric(self, message): tags, fields, ts = message.split(' ') metric = tags.split(',')[0] ts = ts.strip() tags_dict = {} for tag in tags.split(',')[1:]: key, value = tag.split('=') tags_dict[key] = value fields_dict = {} for field in fields.split(','): key, value = field.split('=') fields_dict[key] = value host = tags_dict.get('host') metric_dict = { 'metric': metric, 'ts': ts, 'tags': tags_dict, 'fields': fields_dict, 'host': host, 'time_collected': str(datetime.datetime.fromtimestamp(int(ts[:-9]))), 'time_pipeline': str(datetime.datetime.now()) } return metric_dict def handle_cpu_metric(self, data): if data['tags']['cpu'] != 'cpu-total': return usage_user = self.get_basic_metric_dict(data) usage_user.update({ 'app': 'dbaas.cpu.usage_user.{}'.format(data['host']), 'value': float(data['fields']['usage_user']), }) usage_system = self.get_basic_metric_dict(data) usage_system.update({ 'app': 'dbaas.cpu.usage_system.{}'.format(data['host']), 'value': float(data['fields']['usage_system']), }) usage_idle = self.get_basic_metric_dict(data) usage_idle.update({ 'app': 'dbaas.cpu.usage_idle.{}'.format(data['host']), 'value': float(data['fields']['usage_idle']), }) usage_iowait = self.get_basic_metric_dict(data) usage_iowait.update({ 'app': 
'dbaas.cpu.usage_iowait.{}'.format(data['host']), 'value': float(data['fields']['usage_iowait']), }) self.analyze_metric(usage_user) self.analyze_metric(usage_system) self.analyze_metric(usage_idle) self.analyze_metric(usage_iowait) def handle_disk_metric(self, data): if data['tags']['path'] != '/data': return used_percent = self.get_basic_metric_dict(data) used_percent.update({ 'app': 'dbaas.disk.used_percent.{}'.format(data['host']), 'value': float(data['fields']['used_percent']), }) self.analyze_metric(used_percent) def get_basic_metric_dict(self, data): return { 'host': data['host'], 'ts': data['ts'], 'time_collected': data['time_collected'], 'time_pipeline': data['time_pipeline'], } def analyze_metric(self, data): app = data['app'] ts = data['ts'] imput_time = datetime.datetime.fromtimestamp(int(ts[:-9])) value = data['value'] packed_object = self.rediscon.get(app) if packed_object: detector = pickle.loads(packed_object) else: detector = DASRS() anomaly_score = detector.getAnomalyScore(value, imput_time) period_description = detector.get_period_description() is_anomaly = detector.is_anomaly(anomaly_score) packed_object = pickle.dumps(detector) self.rediscon.set(app, packed_object) data.update({ 'anomaly_score': anomaly_score, 'time_detector': str(datetime.datetime.now()), 'is_anomaly': int(is_anomaly), 'period_description': period_description }) self.send_result(data)
class InventoryEventsConsumer: """Inventory events consumer.""" def __init__(self): """Create a Inventory Events Consumer.""" self.consumer = Consumer({ 'bootstrap.servers': INSIGHTS_KAFKA_ADDRESS, 'group.id': GROUP_ID, 'enable.auto.commit': False }) # Subscribe to topic self.consumer.subscribe([INVENTORY_EVENTS_TOPIC]) self.event_type_map = { 'delete': self.host_delete_event, 'created': self.host_create_update_events, 'updated': self.host_create_update_events } self.prefix = 'PROCESSING INVENTORY EVENTS' def __iter__(self): return self def __next__(self): msg = self.consumer.poll() if msg is None: raise StopIteration return msg def run(self): """Initialize Consumer.""" for msg in iter(self): if msg.error(): print(msg.error()) raise KafkaException(msg.error()) try: msg = json.loads(msg.value().decode("utf-8")) event_type = msg['type'] if event_type in self.event_type_map.keys(): handler = self.event_type_map[event_type] handler(msg) else: LOG.info('Event Handling is not found for event %s - %s', event_type, self.prefix) except json.decoder.JSONDecodeError: LOG.error('Unable to decode kafka message: %s - %s', msg.value(), self.prefix) except Exception as err: LOG.error( 'An error occurred during message processing: %s in the system %s created from account: %s - %s', repr(err), msg['host']['id'], msg['host']['account'], self.prefix, ) finally: self.consumer.commit() LOG.warning("Stopping inventory consumer") self.consumer.close() def host_delete_event(self, msg): """Process delete message.""" self.prefix = "PROCESSING DELETE EVENT" host_id = msg['id'] insights_id = msg['insights_id'] with app.app_context(): LOG.info( 'Deleting performance profile records with insights_id %s - %s', insights_id, self.prefix) rows_deleted = db.session.query( System.id).filter(System.inventory_id == host_id).delete() if rows_deleted > 0: LOG.info('Deleted host from inventory with id: %s - %s', host_id, self.prefix) db.session.commit() def host_create_update_events(self, msg): """ Process created/updated message ( create system record, store new report )""" self.prefix = "PROCESSING Create/Update EVENT" if 'is_ros' in msg['platform_metadata']: self.process_system_details(msg) def process_system_details(self, msg): """ Store new system information (stale, stale_warning timestamp) and return internal DB id""" host = msg['host'] performance_record = get_performance_profile( msg['platform_metadata']['url']) if performance_record: performance_utilization = self._calculate_performance_utilization( performance_record, host) with app.app_context(): account = get_or_create(db.session, RhAccount, 'account', account=host['account']) system = get_or_create( db.session, System, 'inventory_id', account_id=account.id, inventory_id=host['id'], display_name=host['display_name'], fqdn=host['fqdn'], cloud_provider=host['system_profile']['cloud_provider'], instance_type=performance_record.get('instance_type'), stale_timestamp=host['stale_timestamp']) get_or_create(db.session, PerformanceProfile, ['system_id', 'report_date'], system_id=system.id, performance_record=performance_record, performance_utilization=performance_utilization, report_date=datetime.datetime.utcnow().date()) # Commit changes db.session.commit() LOG.info( "Refreshed system %s (%s) belonging to account: %s (%s) via report-processor", system.inventory_id, system.id, account.account, account.id) def _calculate_performance_utilization(self, performance_record, host): MAX_IOPS_CAPACITY = 16000 memory_utilized = (float(performance_record['mem.util.used']) / 
float(performance_record['mem.physmem'])) * 100 cpu_utilized = self._calculate_cpu_score(performance_record) cloud_provider = host['system_profile']['cloud_provider'] if cloud_provider == 'aws': MAX_IOPS_CAPACITY = 16000 if cloud_provider == 'azure': MAX_IOPS_CAPACITY = 20000 io_utilized = (float(performance_record['disk.all.total']) / float(MAX_IOPS_CAPACITY)) * 100 performance_utilization = { 'memory': int(memory_utilized), 'cpu': int(cpu_utilized), 'io': int(io_utilized) } return performance_utilization def _calculate_cpu_score(self, performance_record): idle_cpu_percent = ( (float(performance_record['kernel.all.cpu.idle']) * 100) / int(performance_record['total_cpus'])) cpu_utilized_percent = 100 - idle_cpu_percent return cpu_utilized_percent
class Consumer: consumer_settings = {} running = False def __init__(self, servers, group_id, topics, auth_params, enable_auto_commit=False, auto_offset_reset='earliest', more_settings: dict = None): if len(servers) == 0: raise KafkaSettingsError('Empty servers') if not group_id: raise KafkaSettingsError('GroupId must be not empty') if not isinstance(topics, list): raise KafkaSettingsError('Topics must be a list') if len(topics) == 0: raise KafkaSettingsError('Empty topics') self.servers = servers self.topics = topics self.group_id = group_id self.enable_auto_commit = enable_auto_commit self.auto_offset_reset = auto_offset_reset self.consumer = None self.auth_params = auth_params self.__edit_settings() self.consumer_settings = { "api.version.request": True, "enable.auto.commit": self.enable_auto_commit, "group.id": self.group_id, "bootstrap.servers": self.servers, "default.topic.config": { "auto.offset.reset": self.auto_offset_reset } } self.consumer_settings.update(self.auth_params) if more_settings: self.consumer_settings.update(more_settings) def __edit_settings(self): if self.auth_params: old_settings = self.auth_params.copy() self.auth_params = {} self.auth_params = { 'security.protocol': 'ssl', 'ssl.key.location': old_settings['ssl_keyfile'], 'ssl.certificate.location': old_settings['ssl_certfile'], 'ssl.ca.location': old_settings['ssl_cafile'] } if isinstance(self.servers, list): old_servers = list(self.servers) self.servers = ','.join(old_servers) def create_consumer(self): from confluent_kafka import Consumer as ConfluentConsumer self.consumer = ConfluentConsumer(self.consumer_settings) self.consumer.subscribe(topics=self.topics) return self.consumer def read_topic(self): self.running = True while self.running: message = self.consumer.poll() msg = message.value().decode() if msg == 'Broker: No more messages': time.sleep(1) continue try: json.loads(msg) except json.decoder.JSONDecodeError: continue application_message = Message() application_message.value = json.loads(msg) application_message.topic = message.topic() application_message.partition = message.partition() application_message.offset = message.offset() application_message.key = message.key() yield application_message def __del__(self): self.consumer.close()
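# A hedged usage sketch for the Consumer wrapper class defined above (not the
# confluent_kafka.Consumer): servers, topic, group id, and certificate paths are
# placeholders. read_topic() yields application Message objects until `running`
# is set to False.
auth = {
    'ssl_keyfile': '/path/to/client.key',
    'ssl_certfile': '/path/to/client.pem',
    'ssl_cafile': '/path/to/ca.pem',
}
consumer = Consumer(servers=['broker1:9093', 'broker2:9093'],
                    group_id='example-group',
                    topics=['example-topic'],
                    auth_params=auth)
consumer.create_consumer()
for app_message in consumer.read_topic():
    print(app_message.topic, app_message.offset, app_message.value)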
class KafkaConsumer:
    """Defines the base kafka consumer class"""

    def __init__(
        self,
        topic_name_pattern,
        message_handler,
        is_avro=True,
        offset_earliest=False,
        sleep_secs=1.0,
        consume_timeout=0.1,
    ):
        """Creates a consumer object for asynchronous use"""
        self.topic_name_pattern = topic_name_pattern
        self.message_handler = message_handler
        self.sleep_secs = sleep_secs
        self.consume_timeout = consume_timeout
        self.offset_earliest = offset_earliest
        self.broker_properties = {
            'bootstrap.servers': 'PLAINTEXT://localhost:9094',
            'group.id': topic_name_pattern,
            'default.topic.config': {
                'auto.offset.reset': 'earliest'
            }
        }

        # Create the Consumer, using the appropriate type.
        if is_avro is True:
            self.broker_properties[
                "schema.registry.url"] = "http://localhost:8081"
            self.consumer = AvroConsumer(self.broker_properties)
        else:
            self.consumer = Consumer(self.broker_properties)

        # Subscribe to the topic; `on_assign` is invoked when partitions are assigned.
        self.consumer.subscribe([topic_name_pattern], on_assign=self.on_assign)

    def on_assign(self, consumer, partitions):
        """Callback for when topic assignment takes place"""
        # seek() cannot be called before partitions are assigned; instead, rewind by
        # setting the desired starting offset on each partition before assigning them.
        if self.offset_earliest:
            for partition in partitions:
                partition.offset = confluent_kafka.OFFSET_BEGINNING

        logger.info("partitions assigned for %s", self.topic_name_pattern)
        consumer.assign(partitions)

    async def consume(self):
        """Asynchronously consumes data from kafka topic"""
        while True:
            num_results = 1
            while num_results > 0:
                num_results = self._consume()
            await gen.sleep(self.sleep_secs)

    def _consume(self):
        """Polls for a message. Returns 1 if a message was received, 0 otherwise"""
        message = self.consumer.poll(self.consume_timeout)
        if message is None:
            logger.warning("no message received by consumer")
            return 0
        elif message.error() is not None:
            logger.error(f"error from consumer {message.error()}")
            return 0
        else:
            self.message_handler(message)
            return 1

    def close(self):
        """Cleans up any open kafka consumers"""
        self.consumer.close()
def write_to_excel():
    # Dump the accumulated word counts to an Excel workbook.
    workbook = xlsxwriter.Workbook('./output/word-count.xlsx')
    worksheet = workbook.add_worksheet()
    row = 0
    col = 0
    # The original fragment iterates `list_of_messages`; here it is derived from the
    # `messages` dict that the poll loop below fills in.
    list_of_messages = [{'text': text, 'count': count} for text, count in messages.items()]
    for item in list_of_messages:
        worksheet.write(row, col, item['text'])
        worksheet.write(row, col + 1, item['count'])
        row += 1
    workbook.close()


schedule.every(FILE_GENERATION_INTERVAL).seconds.do(write_to_excel)

while True:
    schedule.run_pending()
    msg = consumer.poll(1.0)
    if msg is None:
        continue
    if msg.error():
        print("Consumer error: {}".format(msg.error()))
        continue
    messages[msg.key().decode('utf-8')] = int(msg.value())

consumer.close()
def kafka_local_file(opticons=None,broker='',group='',topics=''): broker = argv[0] group = argv[1] topics = argv[2:] # Consumer configuration # See https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md conf = {'bootstrap.servers': broker, 'group.id': group, 'session.timeout.ms': 6000, 'default.topic.config': {'auto.offset.reset': 'smallest'}} # Check to see if -T option exists for opt in optlist: if opt[0] != '-T': continue try: intval = int(opt[1]) except ValueError: sys.stderr.write("Invalid option value for -T: %s\n" % opt[1]) sys.exit(1) if intval <= 0: sys.stderr.write("-T option value needs to be larger than zero: %s\n" % opt[1]) sys.exit(1) conf['stats_cb'] = stats_cb conf['statistics.interval.ms'] = int(opt[1]) # Create logger for consumer (logs will be emitted when poll() is called) logger = logging.getLogger('consumer') logger.setLevel(logging.DEBUG) handler = logging.StreamHandler() handler.setFormatter(logging.Formatter('%(asctime)-15s %(levelname)-8s %(message)s')) logger.addHandler(handler) # Create Consumer instance # Hint: try debug='fetch' to generate some log messages c = Consumer(conf, logger=logger) def print_assignment(consumer, partitions): print('Assignment:', partitions) # Subscribe to topics c.subscribe(topics, on_assign=print_assignment) # hdfs login #client = hdfs.Client('http://%s:50070' % (hdfshost)) # client = InsecureClient('http://%s:50070' % (hdfshost),user='******') # Read messages from Kafka, print to stdout try: while True: logtime = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) msg = c.poll(timeout=1.0) if msg is None: continue if msg.error(): # Error or event if msg.error().code() == KafkaError._PARTITION_EOF: # End of partition event sys.stderr.write('%s %s [%d] reached end at offset %d\n' % (logtime, msg.topic(), msg.partition(), msg.offset())) elif msg.error(): # Error raise KafkaException(msg.error()) else: msgstr = msg.value().decode('utf-8') msgdict = json.loads(msgstr,encoding="uft-8",object_pairs_hook=OrderedDict) database = msgdict.get('database').encode() table = msgdict.get('table').encode() type = msgdict.get('type').encode() if type == 'insert': data = msgdict.get('data') datalist = data.values() datastr = ','.join('%s' % id for id in datalist).encode() hour = time.strftime('%Y-%m-%d-%H',time.localtime(time.time())) localfile = '/mnt/var/%s.%s.%s.%s' % (database,table,type,hour) sys.stderr.write('%s %s [%d] at offset %d with key %s:\n' % (logtime,msg.topic(),msg.partition(),msg.offset(),msgstr)) with open(localfile,'a') as writer: writer.write(datastr+'\n') else: sys.stderr.write('%s %s [%d] at offset %d with key %s:\n' % (logtime,msg.topic(),msg.partition(),msg.offset(),type)) except KeyboardInterrupt: sys.stderr.write('%% Aborted by user\n') # Close down consumer to commit final offsets. c.close()
class KafkaConsumer: """Defines the base kafka consumer class""" def __init__( self, topic_name_pattern, message_handler, is_avro=True, offset_earliest=False, sleep_secs=1.0, consume_timeout=0.1, ): """Creates a consumer object for asynchronous use""" self.topic_name_pattern = topic_name_pattern self.message_handler = message_handler self.sleep_secs = sleep_secs self.consume_timeout = consume_timeout self.offset_earliest = offset_earliest # # # Done: Configure the broker properties below. Make sure to reference the project README # and use the Host URL for Kafka and Schema Registry! # # self.broker_properties = { 'BROKER_URL': 'localhost:9092', 'SCHEMA_REGISTRY_URL': 'localhost:8081', 'REST_PROXY': 'localhost:8082' } # Done: Create the Consumer, using the appropriate type. if is_avro is True: self.broker_properties["schema.registry.url"] = "http://localhost:8081" schema_registry = CachedSchemaRegistryClient( self.broker_properties["schema.registry.url"]) self.consumer = AvroConsumer( {"bootstrap.servers": self.broker_properties.get("BROKER_URL"), "group.id":f"{self.topic_name_pattern}"}, schema_registry = schema_registry) else: self.consumer = Consumer( {"bootstrap.servers": self.broker_properties.get("BROKER_URL"), "group.id": "0"}) # # # Done: Configure the AvroConsumer and subscribe to the topics. Make sure to think about # how the `on_assign` callback should be invoked. # # self.consumer.subscribe([f"^{self.topic_name_pattern}"], on_assign=self.on_assign) def on_assign(self, consumer, partitions): """Callback for when topic assignment takes place""" # Done: If the topic is configured to use `offset_earliest` set the partition offset to # the beginning or earliest logger.info("on_assign is incomplete - skipping") for partition in partitions: partition.offset = OFFSET_BEGINNING logger.info("partitions assigned for %s", self.topic_name_pattern) consumer.assign(partitions) async def consume(self): """Asynchronously consumes data from kafka topic""" while True: num_results = 1 while num_results > 0: num_results = self._consume() await gen.sleep(self.sleep_secs) def _consume(self): """Polls for a message. Returns 1 if a message was received, 0 otherwise""" # # # Done: Poll Kafka for messages. Make sure to handle any errors or exceptions. # Additionally, make sure you return 1 when a message is processed, and 0 when no message # is retrieved. # # message = self.consumer.poll(1.0) ret_code = 0 if message is None: logger.debug("no message received by consumer") ret_code = 0 elif message.error() is not None: logger.debug(f"error from consumer {message.error()}") ret_code = 0 else: logger.info(f"consumed meaage, {message.topic()}") #logger.info(f"consumed message, {message.key()}: {message.value()}") ret_code = 1 self.message_handler(message) #await asyncio.sleep(self.sleep_secs) #logger.info("_consume is incomplete - skipping") return ret_code def close(self): """Cleans up any open kafka consumers""" # # # Done: Cleanup the kafka consumer # # self.consumer.close()
#!/usr/bin/env python3
import os
import uuid

from confluent_kafka import Consumer

consumer = Consumer({
    'bootstrap.servers': os.getenv('BOOTSTRAP_SERVERS'),
    'sasl.mechanism': 'PLAIN',
    'security.protocol': 'SASL_SSL',
    'ssl.ca.location': 'probe',
    'sasl.username': os.getenv('API_KEY'),
    'sasl.password': os.getenv('API_SECRET'),
    'group.id': str(uuid.uuid1()),
    'auto.offset.reset': 'earliest'
})

consumer.subscribe(['users'])

try:
    while True:
        msg = consumer.poll(0.1)
        if msg is None:
            continue
        if msg.error():
            print('consumer error: {}'.format(msg.error()))
            continue
        print('consumed: {}'.format(msg.value()))
except KeyboardInterrupt:
    pass
finally:
    consumer.close()
# Create Consumer instance # Hint: try debug='fetch' to generate some log messages c = Consumer(conf, logger=logger) def print_assignment(consumer, partitions): print('Assignment:', partitions) log.info("subscribing to the topic : " + str(topics)) # Subscribe to topics c.subscribe(topics, on_assign=print_assignment) log.info("Reading msg from the topic : " + str(topics)) # Read messages from Kafka, print to stdout try: while True: msg = c.poll(timeout=1.0) if msg is None: continue if msg.error(): raise KafkaException(msg.error()) else: # Proper message sys.stderr.write('%% %s [%d] at offset %d with key %s:\n' % (msg.topic(), msg.partition(), msg.offset(), str(msg.key()))) print(msg.value()) finally: # Close down consumer to commit final offsets. c.close()
consumer = Consumer({
    'bootstrap.servers': 'pkc-ep9mm.us-east-2.aws.confluent.cloud:9092',
    'sasl.mechanism': 'PLAIN',
    'security.protocol': 'SASL_SSL',
    'sasl.username': '******',
    'sasl.password': '******',
    'group.id': str(uuid.uuid1()),  # this will create a new consumer group on each invocation.
    'auto.offset.reset': 'earliest'
})

consumer.subscribe(['lecroy_files'])

while True:
    msg = consumer.poll()
    # Skip empty polls and error/event messages so only real payloads are queued.
    if msg is None or msg.error():
        continue
    chunk_queue.put(msg.value())

# Setup Kafka
##kafka_config = kcs.get_kafka_config('consumer')
##kafka_config['key_deserializer'] = kc.getKeyDeserializer()
##kafka_config['value_deserializer'] = kc.getValueDeserializer()
##if (not ('enable_auto_commit' in kafka_config.keys())) or kafka_config['enable_auto_commit'] != False:
##    logging.warning('enable_auto_commit is not disabled. Consumer may mark messages written when they are not.')
##consumer = KafkaConsumer(**kafka_config)
##kafka_config = kcs.get_kafka_config('producer')
##kafka_config['key_serializer'] = kc.getKeySerializer()
##kafka_config['value_serializer'] = kc.getValueSerializer()
##producer = KafkaProducer(**kafka_config)
conf = {
    'bootstrap.servers': 'localhost:9092',
    'group.id': 'my_group',
    'default.topic.config': {
        'auto.offset.reset': 'smallest'
    }
}

consumer = Consumer(**conf)
consumer.subscribe(['my-topic'])  # subscribe() returns None, so there is nothing to assign

schema_path = "user.avsc"
schema = avro.schema.Parse(open(schema_path).read())

try:
    running = True
    while running:
        # poll() takes its timeout in seconds, not milliseconds
        msg = consumer.poll(timeout=60.0)
        if msg is None:
            continue
        if msg.error():
            if msg.error().code() == KafkaError._PARTITION_EOF:
                sys.stderr.write('%% %s [%d] reached end at offset %d\n' %
                                 (msg.topic(), msg.partition(), msg.offset()))
            else:
                raise KafkaException(msg.error())
        else:
            sys.stderr.write('%% %s [%d] at offset %d with key %s:\n' %
                             (msg.topic(), msg.partition(), msg.offset(), str(msg.key())))
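# The parsed Avro schema above is never used in the fragment shown. Below is a hedged sketch
# of how it could decode a payload with the standard `avro` package; it assumes the producer
# wrote plain Avro bytes (not the Confluent Schema Registry wire format, which prefixes a
# magic byte and schema id).
import io
import avro.io

def decode_avro(payload, schema):
    """Decode a raw Avro binary payload using the given parsed schema."""
    decoder = avro.io.BinaryDecoder(io.BytesIO(payload))
    reader = avro.io.DatumReader(schema)
    return reader.read(decoder)

# e.g. inside the poll loop, for a valid message:
#     record = decode_avro(msg.value(), schema)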
for stock in stock_list:
    # Store ticker and order_book data to a CSV file
    # Store ticker and order_book data to the InfluxDB database
    # ticker data lives on partition 4
    # ticker_sample_1 = '{"code":"HK.00386","time":"2018-09-03 15:59:50","price":7.67,"volume":4000,"turnover":30680.0,"ticker_direction":"BUY","sequence":6596904796962160642,"type":"AUTO_MATCH"}'
    # ticker_sample_2 = '{"code":"HK.00386","time":"2018-09-03 15:59:50","price":7.67,"volume":2000,"turnover":15340.0,"ticker_direction":"BUY","sequence":6596904796962160644,"type":"AUTO_MATCH"}'
    # ticker_sample_3 = '{"code":"HK.00386","time":"2018-09-03 15:59:51","price":7.67,"volume":2000,"turnover":15340.0,"ticker_direction":"BUY","sequence":6596904801257127938,"type":"AUTO_MATCH"}'
    consumer.assign([TopicPartition(stock, 4, 0)])
    consumer.seek(TopicPartition(stock, 4, 0))
    ticker_pd = pd.DataFrame(columns=[
        'code', 'time', 'price', 'volume', 'turnover', 'ticker_direction',
        'sequence', 'type'
    ])
    while True:
        msg = consumer.poll(3.0)
        if msg is None:
            continue
        if msg.error():
            if msg.error().code() == KafkaError._PARTITION_EOF:
                break
            else:
                print(msg.error())
                break
        # print('Received message: {}'.format(msg.value().decode('utf-8')))
        msg_json = json.loads(msg.value())
        ticker_pd = ticker_pd.append([msg_json], ignore_index=True)
    # print(ticker_pd)
    ticker_pd.to_csv(
        "/home/liugdft/stock/ticker-orderbook-sample/2018.09.03/" + stock +
        "-ticker-20180903.csv")
def consumer(args, poll_timeout=3.0): """ Consumes packets from a Kafka topic. """ # setup the signal handler signal.signal(signal.SIGINT, signal_handler) # where to start consuming messages from kafka_offset_options = { "begin": seek_to_begin, "end": seek_to_end, "stored": seek_to_stored } on_assign_cb = kafka_offset_options[args.kafka_offset] # connect to kafka logging.debug("Connecting to Kafka; %s", args.kafka_configs) kafka_consumer = Consumer(args.kafka_configs) kafka_consumer.subscribe([args.kafka_topic], on_assign=on_assign_cb) # if 'pretty-print' not set, write libpcap global header if args.pretty_print == 0: sys.stdout.write(global_header(args)) sys.stdout.flush() try: pkts_in = 0 while not finished.is_set() and (args.max_packets <= 0 or pkts_in < args.max_packets): # consume a message from kafka msg = kafka_consumer.poll(timeout=poll_timeout) if msg is None: # no message received continue; elif msg.error(): if msg.error().code() == KafkaError._PARTITION_EOF: if args.pretty_print > 0: print "Reached end of topar: topic=%s, partition=%d, offset=%s" % ( msg.topic(), msg.partition(), msg.offset()) else: raise KafkaException(msg.error()) else: pkts_in += 1 logging.debug("Packet received: pkts_in=%d", pkts_in) if args.pretty_print == 0: # write the packet header and packet sys.stdout.write(packet_header(msg)) sys.stdout.write(msg.value()) sys.stdout.flush() elif pkts_in % args.pretty_print == 0: # pretty print print 'Packet[%s]: date=%s topic=%s partition=%s offset=%s len=%s' % ( pkts_in, to_date(unpack_ts(msg.key())), args.kafka_topic, msg.partition(), msg.offset(), len(msg.value())) finally: sys.stdout.close() kafka_consumer.close()
obj = SwiftUploadObject( source=BytesIO(stamp_dict['stampData']), object_name=stamp_dict['fileName']) except TypeError: print('%% Cannot get stamp\n') return obj print ("Start") with SwiftService(options=_opts) as swift, OutputManager() as out_manager: c = Consumer(settings) c.subscribe([topic]) try: while True: msg = c.poll(0.1) if msg is None: continue elif not msg.error(): decoded_msg = fastavro.reader(BytesIO(msg.value())) for alert in decoded_msg: objs = [] objs.append(make_upload_object(alert.get('cutoutDifference'))) objs.append(make_upload_object(alert.get('cutoutTemplate'))) objs.append(make_upload_object(alert.get('cutoutScience'))) container = topic for r in swift.upload(container, objs): if r['success']: if 'object' in r: logger.debug( "uploaded object: %s" % (r['object']))
if __name__ == '__main__': from confluent_kafka import Consumer, KafkaError # 'enable.partition.eof': False # https://github.com/confluentinc/confluent-kafka-python/issues/283 # https://github.com/confluentinc/confluent-kafka-python/issues/176 # https://github.com/edenhill/librdkafka/issues/1024 c = Consumer({'bootstrap.servers': '<kafka server>', 'group.id': 'mygroup', 'enable.partition.eof': False, 'default.topic.config': {'auto.offset.reset': 'smallest'}}) c.subscribe(['<topic>']) running = True while running: msg = c.poll() if not msg.error(): print('Received message: %s' % msg.value().decode('utf-8')) elif msg.error().code() != KafkaError._PARTITION_EOF: print(msg.error()) running = False c.close()
}

# Step 2. Create an instance of a Kafka Consumer
consumer = Consumer(props)

# Step 3. Specify the name of the topic to subscribe to
topicName = "test2"

# Step 4. Have the Consumer subscribe to the specified topic on the Kafka cluster
consumer.subscribe([topicName])

# Step 5. Continuously pull new messages arriving in Kafka
try:
    while True:
        # Ask Kafka to hand over any new messages
        record = consumer.poll(timeout=1.0)  # messages are retrieved one at a time

        # Check for errors
        if record is None:
            continue
        if record.error():
            # Error or event
            if record.error().code() == KafkaError._PARTITION_EOF:
                # End of partition event
                sys.stderr.write('%% %s [%d] reached end at offset %d\n' %
                                 (record.topic(), record.partition(), record.offset()))
            else:
                # Error
                raise KafkaException(record.error())
        else:
            # ** Business logic and message processing go here **
Print published run information from Kafka stream
"""


def parseMessage(buf):
    buf = bytearray(buf)
    runInfo = ISISStream.RunInfo.RunInfo.GetRootAsRunInfo(buf, 0)
    start_time = datetime.datetime.fromtimestamp(runInfo.StartTime()).strftime('%Y-%m-%d %H:%M:%S')
    string_to_print = "Run number: " + str(runInfo.RunNumber()) + \
                      ", Start time: " + start_time + \
                      ", Instrument name: " + runInfo.InstName() + \
                      ", Stream offset: " + str(runInfo.StreamOffset())
    print(string_to_print)


if __name__ == "__main__":
    c = Consumer({'bootstrap.servers': 'sakura',
                  'group.id': 'python-read-run-info',
                  'default.topic.config': {'auto.offset.reset': 'smallest'},
                  'enable.auto.commit': False})
    c.subscribe(['test_run_topic'])
    running = True
    while running:
        msg = c.poll(1000)
        if msg is None:
            # poll() timed out without returning a message
            continue
        if not msg.error():
            parseMessage(msg.value())
        elif msg.error().code() != KafkaError._PARTITION_EOF:
            print(msg.error())
            running = False
        else:
            running = False
    c.close()
class ConfluentKafkaReader(object): def __init__(self, host, port, group, topic, buffer_size, reconnect_wait_time=2): """ Initialize Kafka reader """ logging.info("Initializing Confluent Kafka Consumer") self.host = host self.port = str(port) self.group = group self.topic = [topic] self.buffer_size = buffer_size self.reconnect_wait_time = reconnect_wait_time self.reconnect_retries = 0 self.max_reconnect_retries = 10 # TODO: implement config parameter self.buffer = [] # Initialized on read self.consumer = None def on_assign(self, consumer, partitions): # for p in partitions: # p.offset=-2 # consumer.assign(partitions) logging.debug('on_assignment callback...') logging.info('Assignment:', partitions) def _connect(self): connection = {'bootstrap.servers': self.host+":"+self.port, 'group.id': self.group, 'session.timeout.ms': 6000, 'default.topic.config': {'auto.offset.reset': 'largest'}} logging.info("Connecting to Kafka at %s...", connection) self.consumer = Consumer(**connection) self.consumer.subscribe(self.topic, on_assign=self.on_assign) def read(self): """ Read from Kafka. Reconnect on error. """ try: self._connect() msgcn = 0 while True: msg = self.consumer.poll(timeout=1.0) if msg is None: continue if msg.error(): # Error or event if msg.error().code() == KafkaError._PARTITION_EOF: # End of partition event logging.debug('Catching KafkaError._PARTITION_EOF') logging.error('%s [%d] reached end at offset %d\n', msg.topic(), msg.partition(), msg.offset()) logging.error('%s [%d] at offset %d with key %s:\n', msg.topic(), msg.partition(), msg.offset(), str(msg.key())) break elif msg.error(): # Error # TODO : extend exception handling scope as we will end here # for a lot of reasons ! logging.debug('Catching other errors...') logging.error("Kafka error: %s.", msg.error()) logging.error("Trying to reconnect to %s:%s", self.host, self.port) self.reconnect_retries += 1 time.sleep(self.reconnect_wait_time) if self.reconnect_retries >= self.max_reconnect_retries: logging.error("Max reconnection attempt limit reached (%d). Aborting", self.max_reconnect_retries) break else: self.consumer.close() self._connect() pass #raise KafkaException(msg.error()) else: # Proper message logging.error('%s [%d] at offset %d with key %s:\n', msg.topic(), msg.partition(), msg.offset(), str(msg.key())) (self.buffer).append(msg.value().rstrip('\n')) # otherwise the #writter will add extra \n msgcn += 1 #self.consumer.commit(async=False) if msgcn >= self.buffer_size: logging.debug("Read buffer [%d] reached.",self.buffer_size) break except KeyboardInterrupt: logging.info('Aborted by user\n') # Close down consumer to commit final offsets. self.consumer.close() return(self.buffer)
'security.protocol': 'SASL_SSL', 'sasl.username': conf['sasl.username'], 'sasl.password': conf['sasl.password'], 'group.id': 'python_example_group_1', 'auto.offset.reset': 'earliest' }) # Subscribe to topic c.subscribe([topic]) # Process messages total_count = 0 try: while True: print("Waiting for message or event/error in poll()") msg = c.poll(1.0) if msg is None: # No message available within timeout. # Initial message consumption may take up to # `session.timeout.ms` for the consumer group to # rebalance and start consuming continue elif not msg.error(): # Check for Kafka message record_key = msg.key() record_value = msg.value() data = json.loads(record_value) count = data['count'] total_count += count print("Consumed record with key {} and value {}, \ and updated total count to {}"
class KafkaConsumer: """Defines the base kafka consumer class""" def __init__( self, topic_name_pattern, message_handler, is_avro=True, offset_earliest=False, sleep_secs=1.0, consume_timeout=0.1, ): """Creates a consumer object for asynchronous use""" self.topic_name_pattern = topic_name_pattern self.message_handler = message_handler self.sleep_secs = sleep_secs self.consume_timeout = consume_timeout self.offset_earliest = offset_earliest # TODO: Configure the broker properties below. Make sure to reference the project README # and use the Host URL for Kafka and Schema Registry! self.broker_properties = { "bootstrap.servers": ",".join(["PLAINTEXT://localhost:9092"]), "group.id": f"{topic_name_pattern}", "default.topic.config": {"auto.offset.reset": "earliest"}, } # TODO: Create the Consumer, using the appropriate type. if is_avro is True: self.broker_properties["schema.registry.url"] = "http://localhost:8081" self.consumer = AvroConsumer(self.broker_properties) else: self.consumer = Consumer(self.broker_properties) # TODO: Configure the AvroConsumer and subscribe to the topics. Make sure to think about # how the `on_assign` callback should be invoked. self.consumer.subscribe([self.topic_name_pattern], on_assign=self.on_assign) def on_assign(self, consumer, partitions): """Callback for when topic assignment takes place""" # TODO: If the topic is configured to use `offset_earliest` set the partition offset to # the beginning or earliest for partition in partitions: if self.offset_earliest is True: logger.debug( f"setting partitions to earliest for {self.topic_name_pattern}" ) logger.debug(f"before: {partition}") partition.offset = confluent_kafka.OFFSET_BEGINNING logger.debug(f"after: {partition}") logger.info(f"partitions assigned for {self.topic_name_pattern}") # TODO: Assign the consumer the partitions consumer.assign(partitions) async def consume(self): """Asynchronously consumes data from kafka topic""" while True: num_results = 1 while num_results > 0: num_results = self._consume() await gen.sleep(self.sleep_secs) def _consume(self): """Polls for a message. Returns 1 if a message was received, 0 otherwise""" # TODO: Poll Kafka for messages. Make sure to handle any errors or exceptions. # Additionally, make sure you return 1 when a message is processed, and 0 when no message # is retrieved. logger.debug(f"consuming from topic pattern {self.topic_name_pattern}") try: message = self.consumer.poll(timeout=self.consume_timeout) except SerializerError as e: logger.error( f"failed to deserialize message {self.topic_name_pattern}: {e}" ) return 0 if message is None: logger.debug("no messages to be consumed") return 0 elif message.error() is not None: logger.error( f"failed to consume message {self.topic_name_pattern}: {message.error()}" ) return 0 logger.debug(f"message received: ({message.key()}) {message.value()}") self.message_handler(message) return 1 def close(self): """Cleans up any open kafka consumers""" # TODO: Cleanup the kafka consumer logger.debug("closing consumer...") self.consumer.close()
} c = Consumer(settings) c.subscribe([topic]) # write_to_file(log_name,'Start Receiving Message >>>>>>') write_to_file(log_name, '') p = Producer({ 'bootstrap.servers': kafka_ip + ':9092', 'message.max.bytes': '1100000' }) try: while True: msg = c.poll() if msg is None: continue elif not msg.error(): consume_time = time.time() print( "Receive message: (key={} msg size={}) from topic: {} at time: {}" .format(msg.key(), len(msg.value()), topic, consume_time)) append_to_file( log_name, "Receive message: (key={} msg size={}) from topic: {} at time: {}" .format(msg.key(), len(msg.value()), topic, consume_time)) if (message_forwarding == 'True'): # produce_msg(p,forwarding_topic,msg.key(),msg.value(),log_name) p.produce(forwarding_topic, key=msg.key(), value=msg.value()) p.flush()
class KafkaConsumer: """Defines the base kafka consumer class""" def __init__( self, topic_name_pattern, message_handler, is_avro=True, offset_earliest=False, sleep_secs=1.0, consume_timeout=0.1, ): """Creates a consumer object for asynchronous use""" self.topic_name_pattern = topic_name_pattern self.message_handler = message_handler self.sleep_secs = sleep_secs self.consume_timeout = consume_timeout self.offset_earliest = offset_earliest self.broker_properties = { "bootstrap.servers": "PLAINTEXT://localhost:9092", "group.id": f"{topic_name_pattern}", "default.topic.config": { "auto.offset.reset": "earliest" }, } if is_avro is True: self.broker_properties[ "schema.registry.url"] = "http://localhost:8081" self.consumer = AvroConsumer(self.broker_properties) else: self.consumer = Consumer(self.broker_properties) self.consumer.subscribe([self.topic_name_pattern], on_assign=self.on_assign) def on_assign(self, consumer, partitions): # print("partition assign") for partition in partitions: if self.offset_earliest is True: partition.offset = confluent_kafka.OFFSET_BEGINNING consumer.assign(partitions) async def consume(self): """Asynchronously consumes data from kafka topic""" while True: num_results = 1 while num_results > 0: num_results = self._consume() await gen.sleep(self.sleep_secs) def _consume(self): message = self.consumer.poll(timeout=self.consume_timeout) if message is None: # print("== message is None") return 0 elif message.error() is not None: # print("== message is error") return 0 # print("== key:",message.key()) # print("== value:",message.value()) self.message_handler(message) return 1 def close(self): # print("== comsumer close") self.consumer.close()
def test_basic_api(): """ Basic API tests, these wont really do anything since there is no broker configured. """ try: kc = Consumer() except TypeError as e: assert str(e) == "expected configuration dict" def dummy_commit_cb(err, partitions): pass kc = Consumer({'group.id': 'test', 'socket.timeout.ms': '100', 'session.timeout.ms': 1000, # Avoid close() blocking too long 'on_commit': dummy_commit_cb}) kc.subscribe(["test"]) kc.unsubscribe() def dummy_assign_revoke(consumer, partitions): pass kc.subscribe(["test"], on_assign=dummy_assign_revoke, on_revoke=dummy_assign_revoke) kc.unsubscribe() msg = kc.poll(timeout=0.001) if msg is None: print('OK: poll() timeout') elif msg.error(): print('OK: consumer error: %s' % msg.error().str()) else: print('OK: consumed message') if msg is not None: assert msg.timestamp() == (TIMESTAMP_NOT_AVAILABLE, -1) msglist = kc.consume(num_messages=10, timeout=0.001) assert len(msglist) == 0, "expected 0 messages, not %d" % len(msglist) with pytest.raises(ValueError) as ex: kc.consume(-100) assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value) with pytest.raises(ValueError) as ex: kc.consume(1000001) assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value) partitions = list(map(lambda part: TopicPartition("test", part), range(0, 100, 3))) kc.assign(partitions) with pytest.raises(KafkaException) as ex: kc.seek(TopicPartition("test", 0, 123)) assert 'Erroneous state' in str(ex.value) # Verify assignment assignment = kc.assignment() assert partitions == assignment # Pause partitions kc.pause(partitions) # Resume partitions kc.resume(partitions) # Get cached watermarks, should all be invalid. lo, hi = kc.get_watermark_offsets(partitions[0], cached=True) assert lo == -1001 and hi == -1001 assert lo == OFFSET_INVALID and hi == OFFSET_INVALID # Query broker for watermarks, should raise an exception. try: lo, hi = kc.get_watermark_offsets(partitions[0], timeout=0.5, cached=False) except KafkaException as e: assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._WAIT_COORD, KafkaError.LEADER_NOT_AVAILABLE),\ str(e.args([0])) kc.unassign() kc.commit(asynchronous=True) try: kc.commit(asynchronous=False) except KafkaException as e: assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._NO_OFFSET) # Get current position, should all be invalid. kc.position(partitions) assert len([p for p in partitions if p.offset == OFFSET_INVALID]) == len(partitions) try: kc.committed(partitions, timeout=0.001) except KafkaException as e: assert e.args[0].code() == KafkaError._TIMED_OUT try: kc.list_topics(timeout=0.2) except KafkaException as e: assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._TRANSPORT) try: kc.list_topics(topic="hi", timeout=0.1) except KafkaException as e: assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._TRANSPORT) kc.close()
class KafkaConsumer:
    """Defines the base kafka consumer class"""

    def __init__(
        self,
        topic_name_pattern,
        message_handler,
        is_avro=True,
        offset_earliest=False,
        sleep_secs=1.0,
        consume_timeout=0.1,
    ):
        """Creates a consumer object for asynchronous use"""
        self.topic_name_pattern = topic_name_pattern
        self.message_handler = message_handler
        self.sleep_secs = sleep_secs
        self.consume_timeout = consume_timeout
        self.offset_earliest = offset_earliest

        self.broker_properties = {
            "bootstrap.servers": BROKER_URL,
            "group.id": GROUP_ID
        }

        # TODO - NEEDED?
        # if offset_earliest:
        #     self.broker_properties['auto.offset.reset'] = 'earliest'

        if is_avro is True:
            self.broker_properties["schema.registry.url"] = SCHEMA_REGISTRY_URL
            self.consumer = AvroConsumer(self.broker_properties)
        else:
            self.consumer = Consumer(self.broker_properties)

        self.consumer.subscribe([topic_name_pattern], on_assign=self.on_assign)

    def on_assign(self, consumer, partitions):
        """Callback for when topic assignment takes place"""
        if self.offset_earliest:
            for partition in partitions:
                partition.offset = confluent_kafka.OFFSET_BEGINNING

        logger.info("partitions assigned for %s", self.topic_name_pattern)
        consumer.assign(partitions)

    async def consume(self):
        """Asynchronously consumes data from kafka topic"""
        while True:
            num_results = 1
            while num_results > 0:
                num_results = self._consume()
            await gen.sleep(self.sleep_secs)

    def _consume(self):
        """Polls for a message. Returns 1 if a message was received, 0 otherwise"""
        try:
            message = self.consumer.poll(self.consume_timeout)
            if message is None or message.error():
                return 0
            else:
                self.message_handler(message)
                return 1
        except Exception as e:
            logger.error(f"Failed to consume message: {e}")
            # Return 0 so the consume() loop always compares against an integer.
            return 0

    def close(self):
        """Cleans up any open kafka consumers"""
        self.consumer.close()
# 'sasl.mechanisms': conf['sasl.mechanisms'], # 'security.protocol': conf['security.protocol'], # 'sasl.username': conf['sasl.username'], # 'sasl.password': conf['sasl.password'], 'group.id': 'python_example_group_1', 'auto.offset.reset': 'earliest', }) # Subscribe to topic c.subscribe([topic]) # Process messages total_count = 0 try: while True: msg = c.poll(1.0) if msg is None: # No message available within timeout. # Initial message consumption may take up to # `session.timeout.ms` for the consumer group to # rebalance and start consuming print("Waiting for message or event/error in poll()") continue elif msg.error(): print('error: {}'.format(msg.error())) else: # Check for Kafka message record_key = msg.key() record_value = msg.value() data = json.loads(record_value) count = data['count']
def run(self):
    consumer = Consumer(self.config)
    callbacks = collections.defaultdict(list)
    functions = self.functions

    def close():
        log.info('Closing consumer')
        consumer.close()

    atexit.register(close)

    while True:
        add, update, remove = functions.refresh()
        if add or update or remove:
            existing_topics = set(callbacks.keys())
            for f in add:
                callbacks[functions.arguments(f).get('topic')].append(f)
            for f in update:
                pass
            for f in remove:
                callbacks[functions.arguments(f).get('topic')].remove(f)
            interested_topics = set(callbacks.keys())
            if existing_topics.symmetric_difference(interested_topics):
                log.debug(f'Subscribing to {interested_topics}')
                consumer.subscribe(list(interested_topics))

        log.debug('Before polling...')
        message = consumer.poll(timeout=functions.refresh_interval)
        log.debug('After polling...')

        if not message:
            log.debug('Empty message received')
        elif not message.error():
            log.debug('Key:' + str(message.key()) + ' Value:' + str(message.value()))
            topic, key, value = message.topic(), message.key(), message.value()
            try:
                key = message.key().decode('utf-8')
            except Exception:
                pass
            try:
                value = json.loads(value)
            except Exception:
                pass
            for function in callbacks[topic]:
                jq_filter = functions.arguments(function).get('filter')
                try:
                    if jq_filter and not pyjq.first(jq_filter, value):
                        continue
                except Exception:
                    log.error(f'Could not filter message value with {jq_filter}')
                data = self.function_data(function, topic, key, value)
                log.debug('Invoking a function with data:' + str(data))
                functions.gateway.post(functions._gateway_base + f'/function/{function["name"]}',
                                       data=data)
                log.debug('Function: ' + f'/function/{function["name"]}' + ' Data:' + str(data))
            # If auto commit is not enabled, manually commit the message. Comparing the env
            # var against 'true' avoids the bool('False') pitfall, and `asynchronous` replaces
            # the old `async` keyword argument (a reserved word since Python 3.7).
            if os.getenv('ENABLE_AUTO_COMMIT', 'True').lower() != 'true':
                consumer.commit(message=message, asynchronous=False)
logger.addHandler(handler) # Create Consumer instance # Hint: try debug='fetch' to generate some log messages c = Consumer(conf, logger=logger) def print_assignment(consumer, partitions): print('Assignment:', partitions) # Subscribe to topics c.subscribe(topics, on_assign=print_assignment) # Read messages from Kafka, print to stdout try: while True: msg = c.poll(timeout=1.0) if msg is None: continue if msg.error(): raise KafkaException(msg.error()) else: # Proper message sys.stderr.write('%% %s [%d] at offset %d with key %s:\n' % (msg.topic(), msg.partition(), msg.offset(), str(msg.key()))) print(msg.value()) except KeyboardInterrupt: sys.stderr.write('%% Aborted by user\n') finally:
class KafkaConsumer:
    """Defines the base kafka consumer class"""

    def __init__(
        self,
        topic_name_pattern,
        message_handler,
        is_avro=True,
        offset_earliest=False,
        sleep_secs=1.0,
        consume_timeout=0.1,
    ):
        """Creates a consumer object for asynchronous use"""
        self.topic_name_pattern = topic_name_pattern
        self.message_handler = message_handler
        self.sleep_secs = sleep_secs
        self.consume_timeout = consume_timeout
        self.offset_earliest = offset_earliest

        self.broker_properties = {
            "group.id": "consumer_group",
            "bootstrap.servers": "PLAINTEXT://localhost:9092",
            "auto.offset.reset": "earliest"
        }

        if is_avro is True:
            self.broker_properties[
                "schema.registry.url"] = "http://localhost:8081"
            self.consumer = AvroConsumer(self.broker_properties)
        else:
            self.consumer = Consumer(self.broker_properties)

        self.consumer.subscribe([self.topic_name_pattern],
                                on_assign=self.on_assign)

    def on_assign(self, consumer, partitions):
        """Callback for when topic assignment takes place"""
        for partition in partitions:
            if self.offset_earliest is True:
                partition.offset = confluent_kafka.OFFSET_BEGINNING

        logger.info("partitions assigned for %s", self.topic_name_pattern)
        consumer.assign(partitions)

    async def consume(self):
        """Asynchronously consumes data from kafka topic"""
        while True:
            num_results = 1
            while num_results > 0:
                num_results = self._consume()
            await gen.sleep(self.sleep_secs)

    def _consume(self):
        """Polls for a message. Returns 1 if a message was received, 0 otherwise"""
        message = self.consumer.poll(1.0)
        if message is None:
            return 0
        elif message.error() is not None:
            print(f"error from consumer: {message.error()}")
            return 0
        else:
            print(f"consumed message with key {message.key()} and value {message.value()}")
            self.message_handler(message)
            return 1

    def close(self):
        """Cleans up any open kafka consumers"""
        logger.debug("closing consumer")
        self.consumer.close()
def run_commit_log_consumer(bootstrap_servers, consumer_group, commit_log_topic, partition_state_manager, synchronize_commit_group, start_event, stop_request_event): start_event.set() logging.debug('Starting commit log consumer...') positions = {} # NOTE: The commit log consumer group should not be persisted into the # ``__consumer_offsets`` topic since no offsets are committed by this # consumer. The group membership metadata messages will be published # initially but as long as this group remains a single consumer it will # be deleted after the consumer is closed. # It is very important to note that the ``group.id`` **MUST** be unique to # this consumer process!!! This ensures that it is able to consume from all # partitions of the commit log topic and get a comprehensive view of the # state of the consumer groups it is tracking. consumer = Consumer({ 'bootstrap.servers': bootstrap_servers, 'group.id': consumer_group, 'enable.auto.commit': 'false', 'enable.auto.offset.store': 'true', 'enable.partition.eof': 'false', 'default.topic.config': { 'auto.offset.reset': 'error', }, }) def rewind_partitions_on_assignment(consumer, assignment): # The commit log consumer must start consuming from the beginning of # the commit log topic to ensure that it has a comprehensive view of # all active partitions. consumer.assign([ TopicPartition( i.topic, i.partition, positions.get((i.topic, i.partition), OFFSET_BEGINNING), ) for i in assignment ]) consumer.subscribe( [commit_log_topic], on_assign=rewind_partitions_on_assignment, ) while not stop_request_event.is_set(): message = consumer.poll(1) if message is None: continue error = message.error() if error is not None: raise Exception(error) positions[(message.topic(), message.partition())] = message.offset() + 1 group, topic, partition, offset = get_commit_data(message) if group != synchronize_commit_group: logger.debug('Received consumer offsets update from %r, ignoring...', group) continue if offset in LOGICAL_OFFSETS: logger.debug( 'Skipping invalid logical offset (%r) from %s/%s...', offset, topic, partition) continue elif offset < 0: logger.warning( 'Received unexpected negative offset (%r) from %s/%s!', offset, topic, partition) partition_state_manager.set_remote_offset(topic, partition, offset)
class SynchronizedConsumer(object): """ This class implements the framework for a consumer that is intended to only consume messages that have already been consumed and committed by members of another consumer group. This works similarly to the Kafka built-in ``__consumer_offsets`` topic. The consumer group that is being "followed" (the one that must make progress for our consumer here to make progress, identified by the ``synchronize_commit_group`` constructor parameter/instance attribute) must report its offsets to a topic (identified by the ``commit_log_topic`` constructor parameter/instance attribute). This consumer subscribes to both commit log topic, as well as the topic(s) that we are actually interested in consuming messages from. The messages received from the commit log topic control whether or not consumption from partitions belonging to the main topic is paused, resumed, or allowed to continue in its current state without changes. The furthest point in any partition that this consumer should ever consume to is the maximum offset that has been recorded to the commit log topic for that partition. If the offsets recorded to that topic move non-monotonically (due to an intentional offset rollback, for instance) this consumer *may* consume up to the highest watermark point. (The implementation here tries to pause consuming from the partition as soon as possible, but this makes no explicit guarantees about that behavior.) """ initial_offset_reset_strategies = { 'earliest': get_earliest_offset, 'latest': get_latest_offset, } def __init__(self, bootstrap_servers, consumer_group, commit_log_topic, synchronize_commit_group, initial_offset_reset='latest', on_commit=None): self.bootstrap_servers = bootstrap_servers self.consumer_group = consumer_group self.commit_log_topic = commit_log_topic self.synchronize_commit_group = synchronize_commit_group self.initial_offset_reset = self.initial_offset_reset_strategies[initial_offset_reset] self.__partition_state_manager = SynchronizedPartitionStateManager( self.__on_partition_state_change) self.__commit_log_consumer, self.__commit_log_consumer_stop_request = self.__start_commit_log_consumer() self.__positions = {} def commit_callback(error, partitions): if on_commit is not None: return on_commit(error, partitions) consumer_configuration = { 'bootstrap.servers': self.bootstrap_servers, 'group.id': self.consumer_group, 'enable.auto.commit': 'false', 'enable.auto.offset.store': 'true', 'enable.partition.eof': 'false', 'default.topic.config': { 'auto.offset.reset': 'error', }, 'on_commit': commit_callback, } self.__consumer = Consumer(consumer_configuration) def __start_commit_log_consumer(self, timeout=None): """ Starts running the commit log consumer. 
""" stop_request_event = threading.Event() start_event = threading.Event() result = execute( functools.partial( run_commit_log_consumer, bootstrap_servers=self.bootstrap_servers, consumer_group='{}:sync:{}'.format(self.consumer_group, uuid.uuid1().hex), commit_log_topic=self.commit_log_topic, synchronize_commit_group=self.synchronize_commit_group, partition_state_manager=self.__partition_state_manager, start_event=start_event, stop_request_event=stop_request_event, ), ) start_event.wait(timeout) return result, stop_request_event def __check_commit_log_consumer_running(self): if not self.__commit_log_consumer.running(): try: result = self.__commit_log_consumer.result(timeout=0) # noqa except TimeoutError: pass # not helpful raise Exception('Commit log consumer unexpectedly exit!') def __on_partition_state_change( self, topic, partition, previous_state_and_offsets, current_state_and_offsets): """ Callback that is invoked when a partition state changes. """ logger.debug('State change for %r: %r to %r', (topic, partition), previous_state_and_offsets, current_state_and_offsets) current_state, current_offsets = current_state_and_offsets if current_offsets.local is None: # It only makes sense to manipulate the consumer if we've got an # assignment. (This block should only be entered at startup if the # remote offsets are retrieved from the commit log before the local # consumer has received its assignment.) return # TODO: This will be called from the commit log consumer thread, so need # to verify that calling the ``consumer.{pause,resume}`` methods is # thread safe! if current_state in (SynchronizedPartitionState.UNKNOWN, SynchronizedPartitionState.SYNCHRONIZED, SynchronizedPartitionState.REMOTE_BEHIND): self.__consumer.pause([TopicPartition(topic, partition, current_offsets.local)]) elif current_state is SynchronizedPartitionState.LOCAL_BEHIND: self.__consumer.resume([TopicPartition(topic, partition, current_offsets.local)]) else: raise NotImplementedError('Unexpected partition state: %s' % (current_state,)) def subscribe(self, topics, on_assign=None, on_revoke=None): """ Subscribe to a topic. """ self.__check_commit_log_consumer_running() def assignment_callback(consumer, assignment): # Since ``auto.offset.reset`` is set to ``error`` to force human # interaction on an offset reset, we have to explicitly specify the # starting offset if no offset has been committed for this topic during # the ``__consumer_offsets`` topic retention period. assignment = { (i.topic, i.partition): self.__positions.get((i.topic, i.partition)) for i in assignment } for i in self.__consumer.committed([TopicPartition(topic, partition) for ( topic, partition), offset in assignment.items() if offset is None]): k = (i.topic, i.partition) if i.offset > -1: assignment[k] = i.offset else: assignment[k] = self.initial_offset_reset(consumer, i.topic, i.partition) self.__consumer.assign([TopicPartition(topic, partition, offset) for (topic, partition), offset in assignment.items()]) for (topic, partition), offset in assignment.items(): # Setting the local offsets will either cause the partition to be # paused (if the remote offset is unknown or the local offset is # not trailing the remote offset) or resumed. 
self.__partition_state_manager.set_local_offset(topic, partition, offset) self.__positions[(topic, partition)] = offset if on_assign is not None: on_assign(self, [TopicPartition(topic, partition) for topic, partition in assignment.keys()]) def revocation_callback(consumer, assignment): for item in assignment: # TODO: This should probably also be removed from the state manager. self.__positions.pop((item.topic, item.partition)) if on_revoke is not None: on_revoke(self, assignment) self.__consumer.subscribe( topics, on_assign=assignment_callback, on_revoke=revocation_callback) def poll(self, timeout): self.__check_commit_log_consumer_running() message = self.__consumer.poll(timeout) if message is None: return if message.error() is not None: return message self.__partition_state_manager.validate_local_message( message.topic(), message.partition(), message.offset()) self.__partition_state_manager.set_local_offset( message.topic(), message.partition(), message.offset() + 1) self.__positions[(message.topic(), message.partition())] = message.offset() + 1 return message def commit(self, *args, **kwargs): self.__check_commit_log_consumer_running() return self.__consumer.commit(*args, **kwargs) def close(self): self.__check_commit_log_consumer_running() self.__commit_log_consumer_stop_request.set() try: self.__consumer.close() finally: self.__commit_log_consumer.result()
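# A hedged usage sketch for SynchronizedConsumer (not from the original code base): the broker
# address, group names, and topics are placeholders, and the surrounding helpers (execute,
# partition state manager, offset reset strategies) are assumed to be importable as above.
# The consumer only makes progress on 'events' once offsets for 'primary-group' appear in the
# commit log topic.
if __name__ == '__main__':
    consumer = SynchronizedConsumer(
        bootstrap_servers='localhost:9092',        # placeholder broker
        consumer_group='follower-group',           # placeholder follower group
        commit_log_topic='commit-log',             # placeholder commit log topic
        synchronize_commit_group='primary-group',  # placeholder group being followed
    )
    consumer.subscribe(['events'])                 # placeholder topic
    try:
        while True:
            message = consumer.poll(1.0)
            if message is None or message.error() is not None:
                continue
            print(message.topic(), message.partition(), message.offset())
            consumer.commit()
    finally:
        consumer.close()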
def httpry_logs(): consumer = Consumer({'bootstrap.servers': kafka_hosts, 'group.id': 'Httpry_logs_%s' %dt,'default.topic.config': {'auto.offset.reset': 'latest','auto.commit.enable':'true'}}) consumer.subscribe(['httpry_logs']) try: while True: msg = consumer.poll() if msg: if not msg.error(): Msg = msg.value().decode('utf-8').strip() try: tm = time.strftime('%Y%m%d%H%M', time.localtime()) httpry_Key = 'httpry_domain.%s' % tm if Msg: msg = Msg.split() if len(msg) == 11: if msg[6] != '-': RC.zincrby(httpry_Key,msg[6], 1) RC.expire(httpry_Key,600) except Exception as e: logging.error(e) continue elif msg.error().code() != KafkaError._PARTITION_EOF: logging.error(msg.error()) continue except Exception as e: logging.error(e) finally: consumer.close()
def analytics_internet2_logs():
    consumer = Consumer({'bootstrap.servers': kafka_hosts, 'group.id': 'Internet2_logs_%s' % dt,
                         'default.topic.config': {'auto.offset.reset': 'latest', 'auto.commit.enable': 'true'}})
    consumer.subscribe(['haproxy_logs'])
    try:
        while True:
            msg = consumer.poll()
            if not msg.error():
                Msg = msg.value().decode('utf-8').strip()
                try:
                    tt = time.strftime('%Y%m%d', time.localtime())
                    tm = time.strftime('%Y%m%d%H%M', time.localtime())
                    Tm = time.strftime('%H:%M', time.localtime())
                    Tra_ser_minute_Key = 'traffic.ser.%s' % tm
                    Tra_cli_minute_Key = 'traffic.cli.%s' % tm
                    if Msg:
                        Msg = Msg.split()
                        if len(Msg) >= 17:
                            traffic_cli = Msg[10]
                            traffic_ser = Msg[11]
                            Topic = str(Msg[14]).split('|')[0].replace('{', '').strip()
                            IP = str(Msg[5])
                            Rtime = Msg[8].split('/')[-1]
                            if Rtime.isdigit():
                                Rtime = int(Rtime)
                            else:
                                Rtime = 0
                            uv_key = 'baihe_uv_%s' % tt
                            Rt_Key = 'Rtime_%s_%s' % (tt, Topic)
                            PATH = str(Msg[16]).split('?')[0]
                            URL = 'http://%s%s' % (Topic, PATH)
                            Tra_ser_url_minute_Key = 'traffic.ser.url_%s' % Tm
                            Tra_cli_url_minute_Key = 'traffic.cli.url_%s' % Tm
                            for KEY in (uv_key, Rt_Key, Tra_ser_url_minute_Key, Tra_cli_url_minute_Key):
                                RC.expire(KEY, 3600)
                            # Traffic
                            if traffic_ser.isdigit() and traffic_cli.isdigit():
                                RC.zincrby(Tra_cli_url_minute_Key, URL, int(traffic_cli))
                                RC.zincrby(Tra_ser_url_minute_Key, URL, int(traffic_ser))
                                # Real-time traffic
                                RC.zincrby(Tra_cli_minute_Key, Topic, int(traffic_cli))
                                RC.expire(Tra_cli_minute_Key, 300)
                                RC.zincrby(Tra_ser_minute_Key, Topic, int(traffic_ser))
                                RC.expire(Tra_ser_minute_Key, 300)
                            if Rtime:
                                RC.lpush(Rt_Key, Rtime)
                            RC.sadd(uv_key, IP)
                except Exception as e:
                    logging.error(e)
                    continue
            elif msg.error().code() != KafkaError._PARTITION_EOF:
                logging.error(msg.error())
                continue
    except Exception as e:
        logging.error(e)
    finally:
        consumer.close()