def consume(topic, conf):
    """
    Consume User records
    """
    from confluent_kafka.avro import AvroConsumer
    from confluent_kafka.avro.serializer import SerializerError

    print("Consuming user records from topic {} with group {}. ^c to exit.".format(topic, conf["group.id"]))

    c = AvroConsumer(conf, reader_value_schema=record_schema)
    c.subscribe([topic])

    while True:
        try:
            msg = c.poll(1)

            # There were no messages on the queue, continue polling
            if msg is None:
                continue

            if msg.error():
                print("Consumer error: {}".format(msg.error()))
                continue

            record = User(msg.value())
            print("name: {}\n\tfavorite_number: {}\n\tfavorite_color: {}\n".format(
                record.name, record.favorite_number, record.favorite_color))
        except SerializerError as e:
            # Report malformed record, discard results, continue polling
            print("Message deserialization failed {}".format(e))
            continue
        except KeyboardInterrupt:
            break

    print("Shutting down consumer..")
    c.close()
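# A minimal usage sketch for the consume() helper above. The broker address,
# schema registry URL, topic and group id are placeholder assumptions, and the
# function also relies on the module-level `record_schema` and `User` names it
# references; this is one possible configuration, not the only valid one.
if __name__ == "__main__":
    consumer_conf = {
        "bootstrap.servers": "localhost:9092",            # assumed broker
        "schema.registry.url": "http://localhost:8081",   # assumed registry
        "group.id": "example_avro_group",                 # assumed group id
        "auto.offset.reset": "earliest",
    }
    consume("example_avro_topic", consumer_conf)          # assumed topic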
class KafkaWorker(BaseWorker):
    topic_name = None
    consumer_name = None
    consumer_settings = {}
    commit_on_complete = False
    async_commit = True
    poll_timeout = 0
    auto_offset_reset = 'earliest'
    consumer = None
    last_message = None

    def setup(self):
        self.consumer = AvroConsumer(self.get_consumer_settings())
        self.consumer.subscribe([self.get_topic_name()])

    def teardown(self):
        if self.consumer:
            self.consumer.close()

    def get_topic_name(self):
        return self.topic_name or utils.config_missing('topic name')

    def get_consumer_name(self):
        return self.consumer_name or utils.generate_random_consumer_name()

    def get_consumer_settings(self):
        default_settings = {
            'group.id': self.get_consumer_name(),
            'default.topic.config': {'auto.offset.reset': self.auto_offset_reset},
            'enable.auto.commit': False,
            'bootstrap.servers': utils.get_broker_url(),
            'schema.registry.url': utils.get_schema_registry_url(),
            'session.timeout.ms': 10000,
            'heartbeat.interval.ms': 1000,
            'api.version.request': True,
        }
        return utils.generate_client_settings(default_settings, self.consumer_settings)

    def poll(self):
        message = self.consumer.poll(timeout=self.poll_timeout)
        if message is not None:
            self.last_message = message
        return message

    def get_partitions(self):
        partitions = self.consumer.assignment()
        if not partitions:
            self.poll()
            partitions = self.consumer.assignment()
        return partitions

    def get_current_offsets(self):
        return self.consumer.position(self.get_partitions())

    def reset_consumer_offsets(self, offset):
        self.consumer.assign([TopicPartition(tp.topic, tp.partition, offset)
                              for tp in self.get_partitions()])

    def seek_to_timestamp(self, timestamp):
        timestamp_ms = dt_to_unix_ms(timestamp)
        partitions = self.get_partitions()
        for tp in partitions:
            tp.offset = timestamp_ms
        partitions = self.consumer.offsets_for_times(partitions)
        self.consumer.assign(partitions)

    def handle(self):
        message = self.poll()
        if message is None:
            self.wait()
        elif message.error():
            if message.error().code() == KafkaError._PARTITION_EOF:
                self.partition_eof(message)
            else:
                raise KafkaException(message.error())
        else:
            self._consume(message)
            if self.commit_on_complete:
                self.commit()
        self.done()

    def commit(self):
        if not self.consumer_settings.get('enable.auto.commit'):
            # `asynchronous` replaces the old `async` keyword argument,
            # which collides with the Python 3.7+ reserved word.
            self.consumer.commit(asynchronous=self.async_commit)

    def _consume(self, message):
        self.consume_message(MessageValue(message))

    def consume_message(self, message):
        pass

    def partition_eof(self, message):
        pass
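# A minimal subclass sketch for KafkaWorker, assuming the BaseWorker, utils and
# MessageValue helpers imported by the surrounding project are available. The
# topic name and handler body are placeholders, not part of the original code.
class UserEventWorker(KafkaWorker):
    topic_name = 'user-events'      # assumed topic
    commit_on_complete = True

    def consume_message(self, message):
        # `message` is the MessageValue wrapper built in _consume()
        print("received:", message)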
class SensorConsumer(object):
    def __init__(self, *args, **kwargs):
        self.logger = kwargs.get('logger')
        self.TOPIC = kwargs.get('TOPIC', 's00')
        self.producer = args[0]
        self.bootstrap_servers = kwargs.get('BOOTSTRAP_SERVERS')
        self.schema_registry_url = kwargs.get("SCHEMA_REGISTRY_URL")
        self.group_id = kwargs.get("GROUP_ID")
        self.auto_offset = kwargs.get("auto_offset", "latest")
        self.Q = kwargs.get("Q")
        self.MAX_WORKERS = kwargs.get("MAX_WORKERS")
        self.consumer = AvroConsumer({
            'bootstrap.servers': self.bootstrap_servers,
            'group.id': self.group_id,
            'auto.offset.reset': self.auto_offset,
            'schema.registry.url': self.schema_registry_url
        })
        self.qrs_detector = kwargs.get('qrs_detector')

    def produceMessage(self):
        isRunning = True
        while isRunning:
            value = self.Q.get()
            key = {'prefix': 'sid', 'sensorId': value['sensorId']}
            threadName = currentThread().getName()
            self.producer.produce(key, value)
            self.Q.task_done()
            print("Sent message to kafka [{}]".format(threadName))

    def start(self):
        self.consumer.subscribe([self.TOPIC])

        for i in range(self.MAX_WORKERS):
            Thread(name="Thread-{}".format(i), daemon=True,
                   target=self.produceMessage).start()

        with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
            while True:
                message = None
                try:
                    message = self.consumer.poll(10)
                    # Q.put(message)
                except SerializerError as e:
                    print("Message deserialization failed for {}: {}".format(
                        message, e))
                    break

                if message is None:
                    continue
                elif message.error():
                    self.logger.error("Consumer error: {}".format(
                        message.error()))
                else:
                    data = message.value()
                    sensor_id = data['sensorId']
                    value = executor.submit(self.qrs_detector.run, sensor_id,
                                            data['measurement'],
                                            data['timestamp'])

        self.consumer.close()
# Poll-loop fragment: `c` is an assumed AvroConsumer and `ccloud_lib` supplies
# the Name/Count record helpers used below; the enclosing try/while and the
# poll call are restored here so the except/finally clauses have a home.
total_count = 0
try:
    while True:
        msg = c.poll(1.0)
        if msg is None:
            # No message available within timeout.
            # Initial message consumption may take up to
            # `session.timeout.ms` for the consumer group to
            # rebalance and start consuming
            print("Waiting for message or event/error in poll()")
            continue
        elif msg.error():
            print('error: {}'.format(msg.error()))
        else:
            # Check for Kafka message
            record_key = ccloud_lib.Name(msg.key())
            name_object = record_key.name
            name = name_object['name']
            record_value = ccloud_lib.Count(msg.value())
            count_object = record_value.count
            count = count_object['count']
            total_count += count
            print("Consumed record with key {} and value {}, "
                  "and updated total count to {}".format(
                      name, count, total_count))
except SerializerError as e:
    # Report malformed record, discard results, continue polling
    print("Message deserialization failed {}".format(e))
    pass
except KeyboardInterrupt:
    pass
finally:
    # Leave group and commit final offsets
    c.close()
class AvroConsumerApi(ABCMbApi):
    """
    This class implements the interface for a Kafka consumer carrying Avro messages.
    It is expected that users extend this class and override the handle_message function.
    """
    def __init__(self, *, consumer_conf: dict, key_schema_location: str,
                 value_schema_location: str, topics: List[str], batch_size: int = 5,
                 logger: logging.Logger = None, sync: bool = False):
        super(AvroConsumerApi, self).__init__(logger=logger)

        self.key_schema = self.load_schema(schema_file=key_schema_location)
        self.value_schema = self.load_schema(schema_file=value_schema_location)

        self.consumer = AvroConsumer(consumer_conf,
                                     reader_key_schema=self.key_schema,
                                     reader_value_schema=self.value_schema)
        self.running = True
        self.topics = topics
        self.batch_size = batch_size
        self.sync = sync

    def shutdown(self):
        """
        Shutdown the consumer
        :return:
        """
        self.logger.debug("Trigger shutdown")
        self.running = False

    @staticmethod
    def _create_instance(*, module_name: str, class_name: str):
        module = importlib.import_module(module_name)
        class_ = getattr(module, class_name)
        return class_()

    @staticmethod
    def _create_instance_with_params(*, module_name: str, class_name: str):
        module = importlib.import_module(module_name)
        class_ = getattr(module, class_name)
        return class_

    def process_message(self, topic: str, key: dict, value: dict):
        """
        Process the incoming message. Must be overridden in the derived class
        :param topic: topic name
        :param key: incoming message key
        :param value: incoming message value
        :return:
        """
        self.logger.debug("KAFKA: Message received for topic " + topic)
        self.logger.debug("KAFKA: Key = {}".format(key))
        self.logger.debug("KAFKA: Value = {}".format(value))
        class_name = value.get('name', None) + 'Avro'
        self.logger.debug("KAFKA: class_name = {}".format(class_name))
        module_name = f"fabric_mb.message_bus.messages.{re.sub(r'(?<!^)(?=[A-Z])', '_', class_name).lower()}"
        self.logger.debug(f"KAFKA: module_name = {module_name}")

        message = self._create_instance(module_name=module_name, class_name=class_name)
        message.from_dict(value)

        self.handle_message(message=message)

    def handle_message(self, message: AbcMessageAvro):
        """
        Handle incoming message; must be overridden by the derived class
        :param message: incoming message
        """
        print(message)

    def consume(self):
        """
        Consume records unless shutdown is triggered, using a synchronous commit after each message batch.
        """
        self.consumer.subscribe(self.topics)

        msg_count = 0
        while self.running:
            try:
                msg = self.consumer.poll(1)

                # There were no messages on the queue, continue polling
                if msg is None:
                    continue

                if msg.error():
                    self.logger.error(f"KAFKA: Consumer error: {msg.error()}")
                    if msg.error().code() == KafkaError._PARTITION_EOF:
                        # End of partition event
                        self.logger.error(
                            f"KAFKA: {msg.topic()} {msg.partition()} reached end at offset [{msg.offset()}]"
                        )
                    elif msg.error():
                        self.logger.error(f"KAFKA: Consumer error: {msg.error()}")
                    continue

                self.process_message(msg.topic(), msg.key(), msg.value())
                if self.sync:
                    msg_count += 1
                    if msg_count % self.batch_size == 0:
                        self.consumer.commit(asynchronous=False)
            except SerializerError as e:
                # Report malformed record, discard results, continue polling
                self.logger.error(f"KAFKA: Message deserialization failed {e}")
                continue
            except KeyboardInterrupt:
                break

        self.logger.debug("KAFKA: Shutting down consumer..")
        self.consumer.close()
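# A minimal extension sketch, as the class docstring above suggests: subclass
# AvroConsumerApi and override handle_message(). The config values, schema
# paths and topic name in the commented construction are placeholder
# assumptions, not project defaults.
class PrintingConsumer(AvroConsumerApi):
    def handle_message(self, message: AbcMessageAvro):
        # React to the decoded Avro message instead of the default print()
        self.logger.info(f"Handled message of type {type(message).__name__}")

# consumer = PrintingConsumer(
#     consumer_conf={'bootstrap.servers': 'localhost:9092',
#                    'schema.registry.url': 'http://localhost:8081',
#                    'group.id': 'example-group'},
#     key_schema_location='schema/message_key.avsc',      # assumed path
#     value_schema_location='schema/message_value.avsc',  # assumed path
#     topics=['example-topic'],                           # assumed topic
# )
# consumer.consume()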
def main():
    # create Avro Consumer for Kafka
    # Note that only the schema registry has to be given here; Avro deserialization is handled automatically
    c = AvroConsumer({
        'bootstrap.servers': 'broker:29092',
        'group.id': 'anomalie_training',
        'schema.registry.url': 'http://schema-registry:8081'
    })

    # subscribe to topic
    c.subscribe(['anomalie_tutorial'])

    # We need to change the Kafka offset in order to consume from the beginning.
    # There is no straightforward way to achieve this, so a first message has to
    # be polled so that the assignment can be obtained. Afterwards the assignment
    # (in the form of topic partitions) is changed by setting the offset to the beginning.
    msg = c.poll(10)
    topic_partition = c.assignment()

    for partition in topic_partition:
        partition.offset = OFFSET_BEGINNING

    c.assign(topic_partition)

    # Consume messages from topic
    messages = []
    while True:
        msg = c.poll(1)
        if msg is None:
            break
        messages.append(msg.value())
    c.close()

    # transform messages to Pandas DataFrame and do feature engineering
    df = pd.DataFrame(messages)
    df.timestamp = pd.to_datetime(df.timestamp * 1000000)
    df['hour'] = df.timestamp.dt.hour
    df['business_hour'] = ((df.hour < 8) | (df.hour > 18)).astype("int")
    df.drop(["hour"], axis=1, inplace=True)

    # train/test split
    # note that we cannot use sklearn.model_selection.train_test_split, as this is a time series and a random split is not an option!
    train_length = int(len(df) * 0.6)
    x_train = df.drop("timestamp", axis=1).iloc[:train_length, :]
    x_test = df.drop("timestamp", axis=1).iloc[train_length:, :]

    # Train machine learning model, here an Isolation Forest.
    # contamination is an important parameter: it determines how many data points will be classified as anomalous.
    iso_forest = IsolationForest(n_estimators=100, contamination=float(.02))
    iso_forest.fit(x_train)
    dump(iso_forest, '/data/iso_forest.joblib')

    # make predictions on test set
    predictions = iso_forest.predict(x_test)

    # make plot for evaluation and save figure
    evaluate_anomalies(predictions, df, train_length)
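# An alternative sketch for the rewind-to-beginning step in main(): instead of
# polling once to obtain the assignment, an on_assign callback can set the
# offsets when partitions are handed to the consumer during rebalance. It
# assumes the same AvroConsumer `c` and topic name as above; this is one
# possible approach, not the only one.
from confluent_kafka import OFFSET_BEGINNING

def seek_to_beginning(consumer, partitions):
    # Called on rebalance with the newly assigned partitions
    for p in partitions:
        p.offset = OFFSET_BEGINNING
    consumer.assign(partitions)

# c.subscribe(['anomalie_tutorial'], on_assign=seek_to_beginning)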
def batch_filtering(cityfilter='ALL', mentionfilter='ALL', tagfilter='ALL'):
    if 'username' in request.cookies:
        username = request.cookies['username']
        print(f"Ok, {username}, let's fetch the latest tweets!")

        c = AvroConsumer({
            'bootstrap.servers': BOOTSTRAP_SERVERS,
            'group.id': username,
            'schema.registry.url': SCHEMA_REGISTRY_URL,
            #'isolation.level': 'read_committed'
        })

        c.assign([TopicPartition(TOPIC, 0, 0)])
        low_offset, high_offset = c.get_watermark_offsets(
            TopicPartition(TOPIC, 0))
        #print(f"the latest offset is {high_offset}, the low is {low_offset}")

        # move consumer to offset=high_offset-WINDOW_LEN (only if > 0)
        if high_offset - WINDOW_LEN > 0:
            new_offset = high_offset - WINDOW_LEN
        else:
            new_offset = low_offset
        c.seek(TopicPartition(TOPIC, 0, new_offset))

        msgs = []  # to store the messages to be returned

        pos = c.position([TopicPartition(TOPIC, 0, new_offset)])
        while pos[0].offset < high_offset:
            msg = None  # initialise so the except handler can reference msg safely
            try:
                msg = c.poll(0)
            except SerializerError as e:
                print("Message deserialization failed for {}: {}".format(
                    msg, e))
                break

            if msg is None:
                continue
            if msg.error():
                print("AvroConsumer error: {}".format(msg.error()))
                continue

            author = msg.value()['author']
            content = msg.value()['content']
            #kafka_timestamp = datetime.datetime.fromtimestamp(float(msg.timestamp()[1]/1000)).strftime('%H:%M:%S, %d-%m-%Y')
            timestamp = datetime.datetime.fromtimestamp(
                float(msg.value()['timestamp'])).strftime('%H:%M:%S, %d-%m-%Y')
            message_ts = float(msg.value()['timestamp'])
            location = msg.value()['location']
            tags = [h[1:] for h in content.split() if h.startswith('#')]
            mentions = [h[1:] for h in content.split() if h.startswith('@')]
            display_message = f"[{author}] {content} ({location} - {timestamp})"
            print(f"[{author}] {content} ({location} - {timestamp})")
            #print(f"consumer position: {c.position([TopicPartition(TOPIC, 0, new_offset)])}")
            pos = c.position([TopicPartition(TOPIC, 0, new_offset)])

            if cityfilter != 'ALL' and mentionfilter != 'ALL' and tagfilter != 'ALL':
                if (location.lower() == cityfilter) and (mentionfilter.lower() in mentions) and (tagfilter.lower() in tags):
                    msgs.append((display_message, message_ts))
            elif cityfilter == 'ALL' and mentionfilter != 'ALL' and tagfilter != 'ALL':
                if (mentionfilter.lower() in mentions) and (tagfilter.lower() in tags):
                    msgs.append((display_message, message_ts))
            elif cityfilter != 'ALL' and mentionfilter == 'ALL' and tagfilter != 'ALL':
                if (location.lower() == cityfilter) and (tagfilter.lower() in tags):
                    msgs.append((display_message, message_ts))
            elif cityfilter != 'ALL' and mentionfilter != 'ALL' and tagfilter == 'ALL':
                if (location.lower() == cityfilter) and (mentionfilter.lower() in mentions):
                    msgs.append((display_message, message_ts))
            elif cityfilter != 'ALL' and mentionfilter == 'ALL' and tagfilter == 'ALL':
                if location.lower() == cityfilter:
                    msgs.append((display_message, message_ts))
            elif cityfilter == 'ALL' and mentionfilter != 'ALL' and tagfilter == 'ALL':
                if mentionfilter.lower() in mentions:
                    msgs.append((display_message, message_ts))
            elif cityfilter == 'ALL' and mentionfilter == 'ALL' and tagfilter != 'ALL':
                if tagfilter.lower() in tags:
                    msgs.append((display_message, message_ts))
            else:
                msgs.append((display_message, message_ts))

        c.close()

        # finally return a dictionary of messages
        msgs = list(set(msgs))  # ensure that no duplicates of a message are shown in the timeline
        msgs = sorted(msgs, key=lambda x: x[1])
        msgs = [m[0] for m in msgs]
        print(msgs)
        return {"results": msgs}
    else:
        return {"results": ['Oooops, you are not logged in...']}
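# A compact equivalent of the filter chain above, sketched as a helper: a
# filter set to 'ALL' is treated as "no constraint", otherwise it must match
# its field. This is an optional refactoring suggestion, not part of the
# original route.
def matches_filters(location, mentions, tags, cityfilter, mentionfilter, tagfilter):
    city_ok = cityfilter == 'ALL' or location.lower() == cityfilter
    mention_ok = mentionfilter == 'ALL' or mentionfilter.lower() in mentions
    tag_ok = tagfilter == 'ALL' or tagfilter.lower() in tags
    return city_ok and mention_ok and tag_ok

# if matches_filters(location, mentions, tags, cityfilter, mentionfilter, tagfilter):
#     msgs.append((display_message, message_ts))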