def __seek_from_to_offsets(self, partition, start_offset, end_offset, fft):
    self.log.info(
        f'Start : __seek_from_to_offsets({partition}, {start_offset}, {end_offset})'
    )
    consumer = AvroConsumer({
        'bootstrap.servers': self.bootstrap_servers,
        'group.id': self.group_id,
        'schema.registry.url': self.schema_registry_url
    })
    topic_partition = TopicPartition(self.topic, partition)
    topic_partition.offset = start_offset
    consumer.assign([topic_partition])
    messages = []
    while True:
        message = consumer.poll(10)
        if message is None:
            # poll() timed out without a record; keep waiting instead of crashing
            continue
        if fft:
            dasfft = DasFft()
            message.value()['fft'] = dasfft.amplitudes_fft(
                message.value()['amplitudes'])
        messages.append(message)
        if message.offset() >= end_offset:
            self.log.info(
                f'End : __seek_from_to_offsets({partition}, {start_offset}, {end_offset})'
            )
            consumer.close()
            return messages
def __get_message(self, partition, offset, fft):
    self.log.info(f'Start : __get_message({partition},{offset})')
    consumer = AvroConsumer({
        'bootstrap.servers': self.bootstrap_servers,
        'group.id': self.group_id,
        'schema.registry.url': self.schema_registry_url
    })
    topic_partition = TopicPartition(self.topic, partition)
    topic_partition.offset = offset
    consumer.assign([topic_partition])
    message = consumer.poll(10)
    consumer.close()
    if fft:
        dasfft = DasFft()
        message.value()['fft'] = dasfft.amplitudes_fft(
            message.value()['amplitudes'])
    self.log.info(f'End : __get_message({partition},{offset})')
    return message
class AvroConsumerFacade:
    def __init__(self, name, emit_datum, broker, schema_registry_url, topic):
        self.name = name
        self.emit_datum = emit_datum
        self.consumer = AvroConsumer({
            'bootstrap.servers': broker,
            'group.id': name,
            'schema.registry.url': schema_registry_url,
            **get_sr_config_from_environment(),
            **get_kafka_config_from_environment(),
        })

        # Subscribe to topics/partitions, and seek to end. Following that we need
        # to poll until the topics have actually been assigned.
        def on_assign(consumer, partitions):
            for p in partitions:
                p.offset = OFFSET_END
            self.consumer.assign(partitions)

        self.consumer.subscribe([topic], on_assign=on_assign)
        self.consumer.poll(10)

    def consume_one(self, poll_wait=0):
        consumed_message = self.consumer.poll(poll_wait)
        if consumed_message is not None:
            self.emit_datum(Datum(good_count=1))
        else:
            self.emit_datum(Datum(bad_count=1))

    def close(self):
        self.consumer.commit()
        self.consumer.close()
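# A minimal usage sketch for the facade above. The broker, registry URL, topic
# name, and the print-based emit_datum callback are illustrative assumptions,
# not values taken from the original code; Datum is the counter type the
# facade already expects.
facade = AvroConsumerFacade(
    name='metrics-probe',                         # hypothetical consumer group
    emit_datum=lambda datum: print(datum),        # hypothetical sink for Datum counts
    broker='localhost:9092',                      # assumed broker address
    schema_registry_url='http://localhost:8081',  # assumed registry address
    topic='example-topic',                        # assumed topic name
)
facade.consume_one(poll_wait=1)  # emits good_count=1 or bad_count=1
facade.close()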
def target_topic_avro_consumer(unittest_config: Config, target_topic: Tuple[str, int]) -> AvroConsumer:
    consumer = AvroConsumer(
        {
            "group.id": "asdf",
            "enable.auto.commit": False,
            "enable.partition.eof": False,
            **unittest_config.create_confluent_config(include_schema_registry=True),
        }
    )
    consumer.assign([TopicPartition(topic=target_topic[0], partition=i, offset=0)
                     for i in range(target_topic[1])])
    yield consumer
    consumer.close()
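# The generator above reads like a pytest fixture: it yields the consumer and
# closes it afterwards. A minimal sketch of how such a fixture might be used,
# assuming it is registered with @pytest.fixture in the original test module;
# the test name and assertion below are made up for illustration.
def test_reads_all_partitions(target_topic_avro_consumer):
    # the fixture is already assigned to every partition of the topic at offset 0
    messages = []
    while True:
        msg = target_topic_avro_consumer.poll(timeout=1.0)
        if msg is None:
            break
        messages.append(msg.value())
    assert isinstance(messages, list)  # placeholder; a real test would check message contents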
def read_from_offset(self, offset=1000):
    c = AvroConsumer(
        dict(
            self.base_config, **{
                'group.id': 'groupid-1',
                'default.topic.config': {
                    'auto.offset.reset': 'beginning',
                    'auto.commit.enable': 'false'
                }
            }))
    c.assign([TopicPartition(self.topic, partition=0, offset=offset)])
    return self.run_loop(c, return_message=True, file_object=False)
def read_from_start(self, persist=False, path='/'):
    c = AvroConsumer(
        dict(
            self.base_config, **{
                'group.id': 'groupid',
                'default.topic.config': {
                    'auto.offset.reset': 'beginning',
                    'auto.commit.enable': 'false'
                }
            }))
    c.assign([
        TopicPartition(self.topic,
                       partition=0,
                       offset=confluent_kafka.OFFSET_BEGINNING)
    ])
    if persist:
        with open(os.path.join(path, self.topic + '.txt'), 'w') as out:
            self.run_loop(c, file_object=out)
    else:
        self.run_loop(c)
class KafkaWorker(BaseWorker):
    topic_name = None
    consumer_name = None
    consumer_settings = {}
    commit_on_complete = False
    async_commit = True
    poll_timeout = 0
    auto_offset_reset = 'earliest'
    consumer = None
    last_message = None

    def setup(self):
        self.consumer = AvroConsumer(self.get_consumer_settings())
        self.consumer.subscribe([self.get_topic_name()])

    def teardown(self):
        if self.consumer:
            self.consumer.close()

    def get_topic_name(self):
        return self.topic_name or utils.config_missing('topic name')

    def get_consumer_name(self):
        return self.consumer_name or utils.generate_random_consumer_name()

    def get_consumer_settings(self):
        default_settings = {
            'group.id': self.get_consumer_name(),
            'default.topic.config': {'auto.offset.reset': self.auto_offset_reset},
            'enable.auto.commit': False,
            'bootstrap.servers': utils.get_broker_url(),
            'schema.registry.url': utils.get_schema_registry_url(),
            'session.timeout.ms': 10000,
            'heartbeat.interval.ms': 1000,
            'api.version.request': True,
        }
        return utils.generate_client_settings(default_settings, self.consumer_settings)

    def poll(self):
        message = self.consumer.poll(timeout=self.poll_timeout)
        if message is not None:
            self.last_message = message
        return message

    def get_partitions(self):
        partitions = self.consumer.assignment()
        if not partitions:
            self.poll()
            partitions = self.consumer.assignment()
        return partitions

    def get_current_offsets(self):
        return self.consumer.position(self.get_partitions())

    def reset_consumer_offsets(self, offset):
        self.consumer.assign([TopicPartition(tp.topic, tp.partition, offset)
                              for tp in self.get_partitions()])

    def seek_to_timestamp(self, timestamp):
        timestamp_ms = dt_to_unix_ms(timestamp)
        partitions = self.get_partitions()
        for tp in partitions:
            tp.offset = timestamp_ms
        partitions = self.consumer.offsets_for_times(partitions)
        self.consumer.assign(partitions)

    def handle(self):
        message = self.poll()
        if message is None:
            self.wait()
        elif message.error():
            if message.error().code() == KafkaError._PARTITION_EOF:
                self.partition_eof(message)
            else:
                raise KafkaException(message.error())
        else:
            self._consume(message)
            if self.commit_on_complete:
                self.commit()
        self.done()

    def commit(self):
        if not self.consumer_settings.get('enable.auto.commit'):
            self.consumer.commit(asynchronous=self.async_commit)

    def _consume(self, message):
        self.consume_message(MessageValue(message))

    def consume_message(self, message):
        pass

    def partition_eof(self, message):
        pass
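# A minimal sketch of how this worker base class might be specialised: a
# subclass only has to point at a topic and override consume_message. The
# topic name and print-based handler are illustrative assumptions; BaseWorker,
# utils and MessageValue come from the surrounding project.
class PriceUpdateWorker(KafkaWorker):
    topic_name = 'price-updates'   # hypothetical topic
    commit_on_complete = True      # commit after each successfully handled message

    def consume_message(self, message):
        # message is the MessageValue wrapper around the decoded Avro record
        print('received:', message)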
from confluent_kafka import KafkaError
from confluent_kafka.avro import AvroConsumer
from confluent_kafka.avro.serializer import SerializerError
from confluent_kafka import TopicPartition

c = AvroConsumer({
    'bootstrap.servers': 'localhost:9092',
    'group.id': 'messages-average',
    'schema.registry.url': 'http://0.0.0.0:8081',
})

Partition = TopicPartition('ten-messages-average4', 0)
c.assign([Partition])
# c.seek(Partition)
# print(dir(c))
#
# msg = c.poll(10)
#
# print(msg.value(), msg.key(), msg.offset())

while True:
    try:
        msg = c.poll(10)
    except SerializerError as e:
        print("Message deserialization failed for {}: {}".format(msg, e))
        break

    if msg is None:
        continue
from confluent_kafka import KafkaError
from confluent_kafka import TopicPartition
from confluent_kafka.avro import AvroConsumer
from confluent_kafka.avro.serializer import SerializerError

tp = TopicPartition('pure_project_xml', 0, 0)

c = AvroConsumer({
    'bootstrap.servers': 'localhost:9092',
    'group.id': 'pure_project_output_generator',
    'schema.registry.url': 'http://localhost:8081',
})

c.assign([tp])
assignment = c.assignment()

# Need a timeout here due to this bug: https://github.com/confluentinc/confluent-kafka-python/issues/196
(first_offset, next_offset_to_create) = c.get_watermark_offsets(tp, timeout=1, cached=False)
last_offset = next_offset_to_create - 1

f = open('pure_project.xml', 'w')
f.write(
    '<?xml version="1.0"?>' + "\n" +
    '<project:upmprojects xmlns:common="v3.commons.pure.atira.dk" xmlns:project="v1.upmproject.pure.atira.dk">' + "\n")

# range values explained: We read the topic backwards, starting with the
# last offset. We use `first_offset - 1` because Python's range will stop
# before it reaches that value. So the last offset used will actually be
# the first offset. The last argument is the step, for which we pass -1,
# because we're reading backwards.
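# The snippet ends before the loop that the comment above describes. A sketch
# of what that backwards read could look like, assuming each record's value
# carries its XML fragment in a field named 'xml' (that field name and the
# closing-tag write are guesses, not taken from the original code):
for offset in range(last_offset, first_offset - 1, -1):
    c.seek(TopicPartition('pure_project_xml', 0, offset))  # jump to one specific offset
    msg = c.poll(10)
    if msg is None or msg.error():
        continue
    f.write(msg.value()['xml'] + "\n")  # hypothetical field holding the XML payload

f.write('</project:upmprojects>' + "\n")
f.close()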
}

# no local avro schema setup when we use repository
consumer = AvroConsumer(config)

if (env == "local"):
    topicPartitionData = [
        TopicPartition(apiDataTopic, p) for p in range(0, mytopicpartitions)
    ]
    topicPartitionException = [
        TopicPartition(apiExceptionTopic, p) for p in range(0, mytopicpartitions)
    ]
    subscribed_topics.extend(topicPartitionException)
    subscribed_topics.extend(topicPartitionData)
    consumer.assign(subscribed_topics)
else:
    subscribed_topics.append(apiDataTopic)
    subscribed_topics.append(apiExceptionTopic)
    consumer.subscribe(subscribed_topics)

log.info("Consumer is listening for messages coming to topics: " +
         str(subscribed_topics))

badRecords = list()

# Read all messages from 2 topics
while True:
    try:
        msg = consumer.poll(2)
        badRecords.clear()
        if msg is None:
            continue
class AvroAsync(object):
    def __init__(self, topic=None, ip='localhost'):
        self.topic = topic
        self.ip = ip  # os.environ['KAFKA_SERVER_IP']
        self.base_config = {
            'bootstrap.servers': self.ip + ':9092',
            'schema.registry.url': 'http://' + self.ip + ':8081'
        }
        self.avro_consumer = AvroConsumer(
            dict(self.base_config, **{'group.id': 'groupid'}))
        self.avro_consumer.assign([TopicPartition(self.topic, 0)])
        self.key_schema = avro.load(os.path.join(SCHEMAS, 'keyschema.avsc'))
        self.value_schema = avro.load(
            os.path.join(SCHEMAS, self.topic + '.avsc'))

    def producer(self):
        return AvroProducer(
            {
                'bootstrap.servers': self.ip + ':9092',
                'schema.registry.url': 'http://' + self.ip + ':8081'
            },
            default_key_schema=self.key_schema,
            default_value_schema=self.value_schema)

    def read_new(self, accumulate=False, n_messages=8, unique=True):
        self.avro_consumer.subscribe([self.topic])
        running = True
        cache = []
        while running:
            msg = self.avro_consumer.poll()
            if not msg.error():
                print(msg.value())
                if accumulate:
                    if len(cache) >= n_messages:
                        self.avro_consumer.close()
                        return cache
                    if unique:
                        if msg not in cache:
                            cache.append(msg.value())
                    else:
                        cache.append(msg.value())
            elif msg.error().code() != KafkaError._PARTITION_EOF:
                print(msg.error())
                running = False
        self.avro_consumer.close()

    def read_from_start(self, persist=False, return_msgs=True, path='/'):
        _logger.debug('Reading data from Kafka from start...')
        c = AvroConsumer(
            dict(
                self.base_config, **{
                    'group.id': 'groupid',
                    'default.topic.config': {
                        'auto.offset.reset': 'beginning',
                        'auto.commit.enable': 'false'
                    }
                }))
        c.assign([
            TopicPartition(self.topic,
                           partition=0,
                           offset=confluent_kafka.OFFSET_BEGINNING)
        ])
        if persist:
            with open(os.path.join(path, self.topic + '.txt'), 'w') as out:
                self.run_loop(c, file_object=out)
        else:
            return self.run_loop(c, return_message=return_msgs)

    def read_from_offset(self, offset=1000):
        c = AvroConsumer(
            dict(
                self.base_config, **{
                    'group.id': 'groupid-1',
                    'default.topic.config': {
                        'auto.offset.reset': 'beginning',
                        'auto.commit.enable': 'false'
                    }
                }))
        c.assign([TopicPartition(self.topic, partition=0, offset=offset)])
        return self.run_loop(c, return_message=True, file_object=False)

    @staticmethod
    def run_loop(consumer, file_object=None, return_message=False):
        _logger.debug('Kafka consumer initialized, looping through data...')
        counter = 0
        msg_stack = []
        last_import = time.time() - 60
        while True:
            if counter % 10000 == 0:  # was `if counter % 10000:`, which logged on nearly every message
                _logger.debug('Read {} messages from Kafka'.format(counter))
            counter += 1
            msg = consumer.poll(timeout=3)
            if file_object or return_message:
                try:
                    msg_stack.append(msg.value())
                except TypeError:
                    print(msg.value())
                if msg.timestamp()[1] / 1000 > last_import:
                    break
            else:
                print(msg)
        if file_object:
            for item in msg_stack:
                file_object.write(json.dumps(item) + '\n')
        if return_message:
            return msg_stack
        print(counter)
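# A minimal usage sketch for the class above. The topic name and broker IP are
# illustrative assumptions; the key schema and per-topic value schema under
# SCHEMAS must already exist for the constructor to load them.
reader = AvroAsync(topic='sensor-readings', ip='localhost')  # hypothetical topic
latest = reader.read_from_offset(offset=500)   # replay partition 0 from a specific offset
full_dump = reader.read_from_start()           # replay partition 0 from the beginning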
def batch_filtering(cityfilter='ALL', mentionfilter='ALL', tagfilter='ALL'):
    if 'username' in request.cookies:
        username = request.cookies['username']
        print(f"Ok, {username}, let's fetch the latest tweets!")

        c = AvroConsumer({
            'bootstrap.servers': BOOTSTRAP_SERVERS,
            'group.id': username,
            'schema.registry.url': SCHEMA_REGISTRY_URL,
            #'isolation.level': 'read_committed'
        })
        c.assign([TopicPartition(TOPIC, 0, 0)])

        low_offset, high_offset = c.get_watermark_offsets(
            TopicPartition(TOPIC, 0))
        #print(f"the latest offset is {high_offset}, the low is {low_offset}")

        # move consumer to offset=high_offset-WINDOW_LEN (only if > 0)
        if high_offset - WINDOW_LEN > 0:
            new_offset = high_offset - WINDOW_LEN
        else:
            new_offset = low_offset
        c.seek(TopicPartition(TOPIC, 0, new_offset))

        msgs = []  # to store the messages to be returned
        pos = c.position([TopicPartition(TOPIC, 0, new_offset)])
        while pos[0].offset < high_offset:
            try:
                msg = c.poll(0)
            except SerializerError as e:
                print("Message deserialization failed for {}: {}".format(
                    msg, e))
                break

            if msg is None:
                continue

            if msg.error():
                print("AvroConsumer error: {}".format(msg.error()))
                continue

            author = msg.value()['author']
            content = msg.value()['content']
            #kafka_timestamp = datetime.datetime.fromtimestamp(float(msg.timestamp()[1]/1000)).strftime('%H:%M:%S, %d-%m-%Y')
            timestamp = datetime.datetime.fromtimestamp(
                float(msg.value()['timestamp'])).strftime('%H:%M:%S, %d-%m-%Y')
            message_ts = float(msg.value()['timestamp'])
            location = msg.value()['location']
            tags = [h[1:] for h in content.split() if h.startswith('#')]
            mentions = [h[1:] for h in content.split() if h.startswith('@')]

            display_message = f"[{author}] {content} ({location} - {timestamp})"
            print(f"[{author}] {content} ({location} - {timestamp})")
            #print(f"consumer position: {c.position([TopicPartition(TOPIC, 0, new_offset)])}")
            pos = c.position([TopicPartition(TOPIC, 0, new_offset)])

            if cityfilter != 'ALL' and mentionfilter != 'ALL' and tagfilter != 'ALL':
                if (location.lower() == cityfilter) and (
                        mentionfilter.lower() in mentions) and (tagfilter.lower() in tags):
                    msgs.append((display_message, message_ts))
            elif cityfilter == 'ALL' and mentionfilter != 'ALL' and tagfilter != 'ALL':
                if (mentionfilter.lower() in mentions) and (tagfilter.lower() in tags):
                    msgs.append((display_message, message_ts))
            elif cityfilter != 'ALL' and mentionfilter == 'ALL' and tagfilter != 'ALL':
                if (location.lower() == cityfilter) and (tagfilter.lower() in tags):
                    msgs.append((display_message, message_ts))
            elif cityfilter != 'ALL' and mentionfilter != 'ALL' and tagfilter == 'ALL':
                if (location.lower() == cityfilter) and (mentionfilter.lower() in mentions):
                    msgs.append((display_message, message_ts))
            elif cityfilter != 'ALL' and mentionfilter == 'ALL' and tagfilter == 'ALL':
                if (location.lower() == cityfilter):
                    msgs.append((display_message, message_ts))
            elif cityfilter == 'ALL' and mentionfilter != 'ALL' and tagfilter == 'ALL':
                if (mentionfilter.lower() in mentions):
                    msgs.append((display_message, message_ts))
            elif cityfilter == 'ALL' and mentionfilter == 'ALL' and tagfilter != 'ALL':
                if (tagfilter.lower() in tags):
                    msgs.append((display_message, message_ts))
            else:
                msgs.append((display_message, message_ts))

        c.close()

        # finally return a dictionary of messages
        msgs = list(
            set(msgs)
        )  # this is done to ensure that no duplicates of a message are shown in the timeline
        msgs = sorted(msgs, key=lambda x: x[1])
        msgs = [m[0] for m in msgs]
        print(msgs)
        return {"results": msgs}
    else:
        return {"results": ['Oooops, you are not logged in...']}
def streaming_filtering():
    cityfilter = request.form['cityfilter']
    mentionfilter = request.form['mentionfilter']
    tagfilter = request.form['tagfilter']
    print(f'cityfilter: {cityfilter}')
    print(f'mentionfilter: {mentionfilter}')
    print(f'tagfilter: {tagfilter}')

    if 'username' in request.cookies:
        username = request.cookies['username']
        print(f"Ok, {username}, let's stream the latest tweets!")

        c = AvroConsumer({
            'bootstrap.servers': BOOTSTRAP_SERVERS,
            'group.id': username,
            'schema.registry.url': SCHEMA_REGISTRY_URL
        })
        c.assign([TopicPartition(TOPIC, 0, 0)])

        low_offset, high_offset = c.get_watermark_offsets(
            TopicPartition(TOPIC, 0))
        print(f"the latest offset is {high_offset}, the low is {low_offset}")
        print(f"consumer position: {c.position([TopicPartition(TOPIC, 0)])}")

        # move consumer to top
        c.seek(TopicPartition(TOPIC, 0, high_offset))

        msgs = []
        pos = c.position([TopicPartition(TOPIC, 0, high_offset)])

        def gen(msgs):  # generator function for streaming
            print('ciao')
            while True:
                try:
                    msg = c.poll(1)
                except SerializerError as e:
                    print("Message deserialization failed for {}: {}".format(
                        msg, e))
                    break

                if msg is None:
                    current_ts = time.time()
                    msgs = [
                        m for m in msgs
                        if (float(current_ts) - float(m[1])) < STREAMING_WINDOW_SECONDS
                    ]
                    ret_msgs = [m[0] for m in msgs]
                    yield f' `{json.dumps(ret_msgs)}` '
                    continue

                if msg.error():
                    current_ts = time.time()
                    msgs = [
                        m for m in msgs
                        if (float(current_ts) - float(m[1])) < STREAMING_WINDOW_SECONDS
                    ]
                    ret_msgs = [m[0] for m in msgs]
                    yield f' `{json.dumps(ret_msgs)}` '
                    print("AvroConsumer error: {}".format(msg.error()))
                    continue

                # get message fields
                author = msg.value()['author']
                content = msg.value()['content']
                #kafka_timestamp = datetime.datetime.fromtimestamp(float(msg.timestamp()[1]/1000)).strftime('%H:%M:%S, %d-%m-%Y')
                timestamp = datetime.datetime.fromtimestamp(
                    float(msg.value()['timestamp'])).strftime(
                        '%H:%M:%S, %d-%m-%Y')
                location = msg.value()['location']
                tags = [h[1:] for h in content.split() if h.startswith('#')]
                mentions = [
                    h[1:] for h in content.split() if h.startswith('@')
                ]

                # create display_message
                display_message = f"[{author}] {content} ({location} - {timestamp})"
                display_message = display_message.replace(
                    "`", "'")  # needed so the streamed output can be read (backticks delimit each chunk)
                message_ts = float(msg.value()['timestamp'])
                print(f"{display_message}")
                print(
                    f"consumer position: {c.position([TopicPartition(TOPIC, 0, high_offset)])}"
                )
                pos = c.position([TopicPartition(TOPIC, 0, high_offset)])
                print('prima')
                print(f'cityfilter: {cityfilter}')
                print(f'mentionfilter: {mentionfilter}')
                print(f'tagfilter: {tagfilter}')

                if cityfilter != 'ALL' and mentionfilter != 'ALL' and tagfilter != 'ALL':
                    if (location.lower() == cityfilter) and (
                            mentionfilter.lower() in mentions) and (tagfilter.lower() in tags):
                        msgs.append((display_message, message_ts))
                elif cityfilter == 'ALL' and mentionfilter != 'ALL' and tagfilter != 'ALL':
                    if (mentionfilter.lower() in mentions) and (tagfilter.lower() in tags):
                        msgs.append((display_message, message_ts))
                elif cityfilter != 'ALL' and mentionfilter == 'ALL' and tagfilter != 'ALL':
                    if (location.lower() == cityfilter) and (tagfilter.lower() in tags):
                        msgs.append((display_message, message_ts))
                elif cityfilter != 'ALL' and mentionfilter != 'ALL' and tagfilter == 'ALL':
                    if (location.lower() == cityfilter) and (mentionfilter.lower() in mentions):
                        msgs.append((display_message, message_ts))
                elif cityfilter != 'ALL' and mentionfilter == 'ALL' and tagfilter == 'ALL':
                    if (location.lower() == cityfilter):
                        msgs.append((display_message, message_ts))
                elif cityfilter == 'ALL' and mentionfilter != 'ALL' and tagfilter == 'ALL':
                    if (mentionfilter.lower() in mentions):
                        msgs.append((display_message, message_ts))
                elif cityfilter == 'ALL' and mentionfilter == 'ALL' and tagfilter != 'ALL':
                    if (tagfilter.lower() in tags):
                        msgs.append((display_message, message_ts))
                else:
                    msgs.append((display_message, message_ts))

                # remove old messages
                current_ts = time.time()
                msgs = [
                    m for m in msgs
                    if (float(current_ts) - float(m[1])) < STREAMING_WINDOW_SECONDS
                ]
                #msgs = list(set(msgs))
                msgs = sorted(msgs, key=lambda x: x[1])
                ret_msgs = [m[0] for m in msgs]
                yield f' `{json.dumps(ret_msgs)}` '

        return Response(stream_with_context(gen(msgs)))
    else:
        return {"results": ['Oooops, you are not logged in...']}
students_average = AvroConsumer({
    'bootstrap.servers': 'localhost:9092',
    'group.id': 'students_average',
    'schema.registry.url': 'http://0.0.0.0:8081',
    'auto.offset.reset': 'earliest'
})

p0 = TopicPartition('students_result_source', 0)
p1 = TopicPartition('students_result_source', 1)
p2 = TopicPartition('students_result_source', 2)

# c.assign([Partition])
c.subscribe(['students_result_source'])
students_average.subscribe(['STUDENTS_AVERAGE'])
c_partition0.assign([p0])
c_partition1.assign([p1])
c_partition2.assign([p2])

searcher = [c_partition0, c_partition1, c_partition2]

key_schema_str = """
{
    "name": "average_key",
    "type": "int"
}
"""

value_schema_str = """
{
    "name": "average_value",
class Consumer:
    def __init__(self,
                 broker,
                 schema_registry,
                 topic=None,
                 logging_enabled=False,
                 group_id=None,
                 auto_commit=True):
        """
        Initialiser for Confluent Consumer using AvroConsumer.
        Each consumer can only be subscribed to one topic.

        Parameters
        ----------
        broker: str
            The URL of the broker (example: 'localhost:9092')
        schema_registry: str
            The URL of the Confluent Schema Registry endpoint (example: 'http://localhost:8081')
        topic: str
            The topic to subscribe to
        logging_enabled: bool, Optional
            If True, messages are logged through a Logger object instead of printed
        group_id: str, Optional
            An optional group id which can be used to load-balance consumers;
            defaults to a hash of the current time
        auto_commit: bool, Optional
            Whether offsets are committed automatically
        """
        if group_id is None:
            new_hash = hashlib.sha1()
            new_hash.update(str(time.time()).encode("utf-8"))
            group_id = new_hash.hexdigest()

        self.__consumer = AvroConsumer({
            "bootstrap.servers": broker,
            "group.id": group_id,
            "schema.registry.url": schema_registry,
            "enable.auto.commit": auto_commit
        })
        self.__consumer_non_avro = KafkaConsumer({
            "bootstrap.servers": broker,
            "group.id": group_id + "0",
            "enable.auto.commit": auto_commit
        })
        self.auto_commit = auto_commit
        if not auto_commit:
            self.consumed_messages = PriorityQueue()
        if topic is not None:
            self.subscribe_to_topic(topic)
        else:
            self.topic = None
        if logging_enabled:
            self.logger = logging.getLogger(__name__)
        else:
            self.logger = None

    def consume(self, timeout=1):
        """
        Method to consume and return a message if one exists and can be deserialized.

        Returns
        -------
        str
            The received message payload as a string
        None
            No message has been received or an error has occurred
        """
        if self.topic is not None:
            msg = None
            non_avro = False
            try:
                msg = self.__consumer.poll(timeout)
            except SerializerError as e:
                try:
                    msg = self.__consumer_non_avro.poll(timeout)
                    non_avro = True
                except Exception as e:
                    self.__log_msg(
                        "Message deserialization has failed {}: {}".format(msg, e),
                        "See the following stack trace",
                        f"{traceback.format_exc()}",
                        delimeter="\n",
                        level="ERROR")
            except RuntimeError as e:
                self.__log_msg(
                    "The consumer has been closed and cannot receive messages",
                    level="ERROR")
            except Exception as e:
                self.__log_msg("An unknown error has occurred {}".format(e),
                               "See the following stack trace",
                               f"{traceback.format_exc()}",
                               delimeter="\n",
                               level="ERROR")

            if msg is not None:
                if msg.error():
                    self.__log_msg("AvroConsumer error: {}".format(msg.error()),
                                   level="ERROR")
                else:
                    if not self.auto_commit:
                        self.consumed_messages.put_nowait(msg)
                    if non_avro:
                        data_to_be_returned = json.loads(msg.value().decode())
                    else:
                        data_to_be_returned = msg.value()
                    return data_to_be_returned
        else:
            raise ValueError("Consumer is currently not subscribed to a topic")

    def __enter__(self):
        return self.__consumer

    def __exit__(self, *args):
        self.close()

    def __log_msg(self,
                  *messages,
                  level="NOTSET",
                  delimeter=" "):
        levels = {
            "CRITICAL": logging.CRITICAL,
            "ERROR": logging.ERROR,
            "WARNING": logging.WARNING,
            "INFO": logging.INFO,
            "DEBUG": logging.DEBUG,
            "NOTSET": logging.NOTSET
        }
        msg = delimeter.join(messages)
        if self.logger is not None:
            if level not in levels:
                raise ValueError(
                    f"level {level} is not valid must be one of {list(levels.keys())}")
            self.logger.log(levels[level], msg)
        else:
            if level is not None:
                print(f"LOGGED MESSAGE: {msg}")
            else:
                print(f"{level}: {msg}")

    def commit(self, asynchronous=True):
        if not self.auto_commit and not self.consumed_messages.empty():
            msg = self.consumed_messages.get_nowait()
            self.__consumer.commit(msg, asynchronous=asynchronous)

    def list_topics(self, topic=None, timeout=1):
        try:
            metadata = self.__consumer.list_topics(topic, timeout)
            topics = metadata.topics
            return list(topics.keys())
        except Exception as e:
            self.__log_msg(
                f"An unknown error has occurred when trying to list topics {e}",
                level="ERROR")
            self.logger.debug(e)

    def check_if_topic_exists(self, topic, timeout=1):
        topic_list = self.list_topics(timeout=timeout)
        if topic_list is not None:
            return topic in topic_list

    def subscribe_to_topic(self, topic):
        try:
            self.__consumer_non_avro.subscribe([topic], on_assign=self.__assign)
            self.__consumer.subscribe([topic], on_assign=self.__assign)
            self.topic = topic
            return True
        except Exception as e:
            self.__log_msg(
                "An unknown error {}".format(e),
                "occurred while trying to subscribe to topic {}".format(topic),
                delimeter=" ",
                level="ERROR")
            return False

    def __assign(self, consumer, partitions):
        for p in partitions:
            p.offset = consumer.get_watermark_offsets(p)[1] - 1
        self.__consumer.assign(partitions)
        self.__consumer_non_avro.assign(partitions)

    def close(self):
        """
        Close the consumer. Once called this object cannot be reused.
        """
        self.__consumer.close()
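# A minimal usage sketch for the Consumer wrapper above. The broker, registry
# URL, and topic name are illustrative assumptions, not values from the
# original code.
consumer = Consumer(
    broker='localhost:9092',                   # assumed broker address
    schema_registry='http://localhost:8081',   # assumed Schema Registry address
    topic='example-events',                    # hypothetical topic
    logging_enabled=True,
    auto_commit=False,
)
payload = consumer.consume(timeout=2)  # returns the decoded value, or None
if payload is not None:
    consumer.commit(asynchronous=False)  # explicitly commit the tracked message
consumer.close()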
def main():
    # create Avro Consumer for Kafka
    # Note that only the schema registry has to be given here; deserialization of Avro is handled automatically
    c = AvroConsumer({
        'bootstrap.servers': 'broker:29092',
        'group.id': 'anomalie_training',
        'schema.registry.url': 'http://schema-registry:8081'
    })

    # subscribe to topic
    c.subscribe(['anomalie_tutorial'])

    # We need to change the Kafka offset in order to consume from the beginning.
    # There is no straightforward way to achieve this, so a first message has to
    # be polled so that the assignment can be obtained. Afterwards the assignment
    # (in the form of topic partitions) is changed by setting the offset to the beginning.
    msg = c.poll(10)
    topic_partition = c.assignment()
    for partition in topic_partition:
        partition.offset = OFFSET_BEGINNING
    c.assign(topic_partition)

    # Consume messages from topic
    messages = []
    while True:
        msg = c.poll(1)
        if msg is None:
            break
        messages.append(msg.value())
    c.close()

    # transform messages to Pandas DataFrame and feature engineering
    df = pd.DataFrame(messages)
    df.timestamp = pd.to_datetime(df.timestamp * 1000000)
    df['hour'] = df.timestamp.dt.hour
    df['business_hour'] = ((df.hour < 8) | (df.hour > 18)).astype("int")
    df.drop(["hour"], axis=1, inplace=True)

    # train test split
    # note that we cannot use sklearn.model_selection.train_test_split as this is a time series and a random split is not an option!
    train_length = int(len(df) * 0.6)
    x_train = df.drop("timestamp", axis=1).iloc[:train_length, :]
    x_test = df.drop("timestamp", axis=1).iloc[train_length:, :]

    # Train Machine Learning Model, here Isolation Forest
    # contamination is an important parameter: it determines how many datapoints will be classified as anomalous.
    iso_forest = IsolationForest(n_estimators=100, contamination=float(.02))
    iso_forest.fit(x_train)
    dump(iso_forest, '/data/iso_forest.joblib')

    # make predictions on test set
    predictions = iso_forest.predict(x_test)

    # make plot for evaluation and save figure
    evaluate_anomalies(predictions, df, train_length)