def consume(topic, conf):
    """
        Consume User records
    """
    from confluent_kafka.avro import AvroConsumer
    from confluent_kafka.avro.serializer import SerializerError

    print("Consuming user records from topic {} with group {}. ^c to exit.".format(topic, conf["group.id"]))

    c = AvroConsumer(conf, reader_value_schema=record_schema)
    c.subscribe([topic])

    while True:
        try:
            msg = c.poll(1)

            # There were no messages on the queue, continue polling
            if msg is None:
                continue

            if msg.error():
                print("Consumer error: {}".format(msg.error()))
                continue

            record = User(msg.value())
            print("name: {}\n\tfavorite_number: {}\n\tfavorite_color: {}\n".format(
                record.name, record.favorite_number, record.favorite_color))
        except SerializerError as e:
            # Report malformed record, discard results, continue polling
            print("Message deserialization failed {}".format(e))
            continue
        except KeyboardInterrupt:
            break

    print("Shutting down consumer..")
    c.close()
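A minimal, hypothetical driver for the function above. The schema string, the User wrapper class, and the broker/registry addresses are placeholders added here for illustration; the original snippet assumes record_schema and User are defined elsewhere in the module.

from confluent_kafka import avro

# Hypothetical reader schema matching the fields the consumer prints.
record_schema = avro.loads("""
{
  "type": "record",
  "name": "User",
  "fields": [
    {"name": "name", "type": "string"},
    {"name": "favorite_number", "type": "int"},
    {"name": "favorite_color", "type": "string"}
  ]
}
""")


class User(object):
    """Thin wrapper exposing the decoded record dict as attributes."""
    def __init__(self, record):
        self.name = record["name"]
        self.favorite_number = record["favorite_number"]
        self.favorite_color = record["favorite_color"]


if __name__ == "__main__":
    consume("users", {
        "bootstrap.servers": "localhost:9092",            # placeholder broker
        "schema.registry.url": "http://localhost:8081",   # placeholder registry
        "group.id": "user-consumer-group",
    })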
Example #2
class KafkaWorker(BaseWorker):
    topic_name = None
    consumer_name = None
    consumer_settings = {}
    commit_on_complete = False
    async_commit = True
    poll_timeout = 0
    auto_offset_reset = 'earliest'
    consumer = None
    last_message = None

    def setup(self):
        self.consumer = AvroConsumer(self.get_consumer_settings())
        self.consumer.subscribe([self.get_topic_name()])

    def teardown(self):
        if self.consumer:
            self.consumer.close()

    def get_topic_name(self):
        return self.topic_name or utils.config_missing('topic name')

    def get_consumer_name(self):
        return self.consumer_name or utils.generate_random_consumer_name()

    def get_consumer_settings(self):
        default_settings = {
            'group.id': self.get_consumer_name(),
            'default.topic.config': {'auto.offset.reset': self.auto_offset_reset},
            'enable.auto.commit': False,
            'bootstrap.servers': utils.get_broker_url(),
            'schema.registry.url': utils.get_schema_registry_url(),
            'session.timeout.ms': 10000,
            'heartbeat.interval.ms': 1000,
            'api.version.request': True,
        }
        return utils.generate_client_settings(default_settings, self.consumer_settings)

    def poll(self):
        message = self.consumer.poll(timeout=self.poll_timeout)
        if message is not None:
            self.last_message = message
        return message

    def get_partitions(self):
        partitions = self.consumer.assignment()
        if not partitions:
            self.poll()
            partitions = self.consumer.assignment()
        return partitions

    def get_current_offsets(self):
        return self.consumer.position(self.get_partitions())

    def reset_consumer_offsets(self, offset):
        self.consumer.assign([TopicPartition(tp.topic, tp.partition, offset)
                              for tp in self.get_partitions()])

    def seek_to_timestamp(self, timestamp):
        timestamp_ms = dt_to_unix_ms(timestamp)
        partitions = self.get_partitions()
        for tp in partitions:
            tp.offset = timestamp_ms
        partitions = self.consumer.offsets_for_times(partitions)
        self.consumer.assign(partitions)

    def handle(self):
        message = self.poll()

        if message is None:
            self.wait()

        elif message.error():
            if message.error().code() == KafkaError._PARTITION_EOF:
                self.partition_eof(message)

            else:
                raise KafkaException(message.error())

        else:
            self._consume(message)

            if self.commit_on_complete:
                self.commit()

        self.done()

    def commit(self):
        if not self.consumer_settings.get('enable.auto.commit'):
            # `async` is a reserved keyword in Python 3.7+; confluent-kafka uses `asynchronous`
            self.consumer.commit(asynchronous=self.async_commit)

    def _consume(self, message):
        self.consume_message(MessageValue(message))

    def consume_message(self, message):
        pass

    def partition_eof(self, message):
        pass
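A minimal sketch of how the worker above appears intended to be used, assuming the surrounding project's BaseWorker drives setup(), handle(), and teardown(); the topic name and consumer group below are placeholders, and the message passed in is the MessageValue wrapper built in _consume().

class UserEventWorker(KafkaWorker):
    # Placeholder topic and group names; override only the class attributes you need.
    topic_name = 'user-events'
    consumer_name = 'user-event-worker'
    commit_on_complete = True
    poll_timeout = 1.0

    def consume_message(self, message):
        # `message` is the MessageValue wrapper produced by _consume().
        print("received record:", message)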
Example #3
class SensorConsumer(object):
    def __init__(self, *args, **kwargs):
        self.logger = kwargs.get('logger')
        self.TOPIC = kwargs.get('TOPIC', 's00')
        self.producer = args[0]
        self.bootstrap_servers = kwargs.get('BOOTSTRAP_SERVERS')
        self.schema_registry_url = kwargs.get("SCHEMA_REGISTRY_URL")
        self.group_id = kwargs.get("GROUP_ID")
        self.auto_offset = kwargs.get("auto_offset", "latest")
        self.Q = kwargs.get("Q")
        self.MAX_WORKERS = kwargs.get("MAX_WORKERS")

        self.consumer = AvroConsumer({
            'bootstrap.servers':
            self.bootstrap_servers,
            'group.id':
            self.group_id,
            'auto.offset.reset':
            self.auto_offset,
            'schema.registry.url':
            self.schema_registry_url
        })

        self.qrs_detector = kwargs.get('qrs_detector')

    def produceMessage(self):
        isRunning = True
        while isRunning:
            value = self.Q.get()
            key = {'prefix': 'sid', 'sensorId': value['sensorId']}
            threadName = currentThread().getName()
            self.producer.produce(key, value)
            self.Q.task_done()
            print("Sent message to kafka [{}]".format(threadName))

    def start(self):
        self.consumer.subscribe([self.TOPIC])
        for i in range(self.MAX_WORKERS):
            Thread(name="Thread-{}".format(i),
                   daemon=True,
                   target=self.produceMessage).start()

        with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
            while True:
                message = None
                try:
                    message = self.consumer.poll(10)
                    # Q.put(message)
                except SerializerError as e:
                    print("Message deserialization failed for {}: {}".format(
                        message, e))
                    break

                if message is None:
                    continue
                elif message.error():
                    self.logger.error("Consumer error: {}".format(
                        message.error()))
                else:
                    data = message.value()
                    sensor_id = data['sensorId']
                    value = executor.submit(self.qrs_detector.run, sensor_id,
                                            data['measurement'],
                                            data['timestamp'])

        self.consumer.close()
Example #4
            if msg is None:
                # No message available within timeout.
                # Initial message consumption may take up to
                # `session.timeout.ms` for the consumer group to
                # rebalance and start consuming
                print("Waiting for message or event/error in poll()")
                continue
            elif msg.error():
                print('error: {}'.format(msg.error()))
            else:
                # Check for Kafka message
                record_key = ccloud_lib.Name(msg.key())
                name_object = record_key.name
                name = name_object['name']
                record_value = ccloud_lib.Count(msg.value())
                count_object = record_value.count
                count = count_object['count']
                total_count += count
                print("Consumed record with key {} and value {}, \
                      and updated total count to {}".format(
                    name, count, total_count))
    except SerializerError as e:
        # Report the malformed record; the loop exits and the consumer
        # is closed in the finally block below
        print("Message deserialization failed {}".format(e))
    except KeyboardInterrupt:
        pass
    finally:
        # Leave group and commit final offsets
        c.close()
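The fragment above omits the consumer setup and the loop it runs in. Below is a hypothetical, runnable skeleton of that surrounding code, with the loop body standing in for the fragment's message handling; the broker and registry addresses, topic, and group id are placeholders, and ccloud_lib is assumed to be the Confluent example helper module defining the Name and Count record classes.

from confluent_kafka.avro import AvroConsumer
from confluent_kafka.avro.serializer import SerializerError

c = AvroConsumer({
    'bootstrap.servers': 'localhost:9092',            # placeholder broker
    'schema.registry.url': 'http://localhost:8081',   # placeholder registry
    'group.id': 'python_example_group',
    'auto.offset.reset': 'earliest',
})
c.subscribe(['users'])

total_count = 0
try:
    while True:
        msg = c.poll(1.0)
        # ... handle msg exactly as in the fragment above ...
except KeyboardInterrupt:
    pass
finally:
    # Leave group and commit final offsets
    c.close()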
Example #5
class AvroConsumerApi(ABCMbApi):
    """
    This class implements a Kafka consumer for Avro messages.
    Users are expected to extend this class and override the handle_message function.
    """
    def __init__(self,
                 *,
                 consumer_conf: dict,
                 key_schema_location,
                 value_schema_location: str,
                 topics: List[str],
                 batch_size: int = 5,
                 logger: logging.Logger = None,
                 sync: bool = False):
        super(AvroConsumerApi, self).__init__(logger=logger)

        self.key_schema = self.load_schema(schema_file=key_schema_location)
        self.value_schema = self.load_schema(schema_file=value_schema_location)

        self.consumer = AvroConsumer(consumer_conf,
                                     reader_key_schema=self.key_schema,
                                     reader_value_schema=self.value_schema)
        self.running = True
        self.topics = topics
        self.batch_size = batch_size
        self.sync = sync

    def shutdown(self):
        """
        Shutdown the consumer
        :return:
        """
        self.logger.debug("Trigger shutdown")
        self.running = False

    @staticmethod
    def _create_instance(*, module_name: str, class_name: str):
        module = importlib.import_module(module_name)
        class_ = getattr(module, class_name)
        return class_()

    @staticmethod
    def _create_instance_with_params(*, module_name: str, class_name: str):
        module = importlib.import_module(module_name)
        class_ = getattr(module, class_name)
        return class_

    def process_message(self, topic: str, key: dict, value: dict):
        """
        Process the incoming message by resolving the matching Avro message class and dispatching it to handle_message
        :param topic: topic name
        :param key: incoming message key
        :param value: incoming message value
        :return:
        """
        self.logger.debug("KAFKA: Message received for topic " + topic)
        self.logger.debug("KAFKA: Key = {}".format(key))
        self.logger.debug("KAFKA: Value = {}".format(value))
        class_name = value.get('name', None) + 'Avro'
        self.logger.debug("KAFKA: class_name = {}".format(class_name))
        module_name = f"fabric_mb.message_bus.messages.{re.sub(r'(?<!^)(?=[A-Z])', '_', class_name).lower()}"
        self.logger.debug(f"KAFKA: module_name = {module_name}")

        message = self._create_instance(module_name=module_name,
                                        class_name=class_name)
        message.from_dict(value)

        self.handle_message(message=message)

    def handle_message(self, message: AbcMessageAvro):
        """
        Handle incoming message; must be overridden by the derived class
        :param message: incoming message
        """
        print(message)

    def consume(self):
        """
            Consume records until shutdown is triggered. Uses a synchronous commit after each message batch when sync is enabled.
        """
        self.consumer.subscribe(self.topics)

        msg_count = 0
        while self.running:
            try:
                msg = self.consumer.poll(1)

                # There were no messages on the queue, continue polling
                if msg is None:
                    continue

                if msg.error():
                    if msg.error().code() == KafkaError._PARTITION_EOF:
                        # End of partition event
                        self.logger.error(
                            f"KAFKA: {msg.topic()} [{msg.partition()}] reached end at offset {msg.offset()}"
                        )
                    else:
                        self.logger.error(f"KAFKA: Consumer error: {msg.error()}")
                    continue

                self.process_message(msg.topic(), msg.key(), msg.value())

                if self.sync:
                    msg_count += 1
                    if msg_count % self.batch_size == 0:
                        self.consumer.commit(asynchronous=False)

            except SerializerError as e:
                # Report malformed record, discard results, continue polling
                self.logger.error(f"KAFKA: Message deserialization failed {e}")
                continue
            except KeyboardInterrupt:
                break

        self.logger.debug("KAFKA: Shutting down consumer..")
        self.consumer.close()
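The class is designed for subclassing; below is a minimal, hypothetical subclass that overrides handle_message. The broker and registry addresses, schema file paths, and topic name are placeholders.

class MyMessageConsumer(AvroConsumerApi):
    def handle_message(self, message: AbcMessageAvro):
        # Replace the default print with application-specific handling.
        print(f"received message: {message}")


consumer = MyMessageConsumer(
    consumer_conf={
        'bootstrap.servers': 'localhost:9092',            # placeholder broker
        'schema.registry.url': 'http://localhost:8081',   # placeholder registry
        'group.id': 'fabric-example-group',
    },
    key_schema_location='schema/message.avsc',    # placeholder schema files
    value_schema_location='schema/message.avsc',
    topics=['fabric-mb-topic'],                   # placeholder topic
    sync=True,
)
consumer.consume()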
Example #6
def main():

    # create Avro Consumer for Kafka
    # Note that only the schema registry URL has to be given here; Avro deserialization is handled automatically
    c = AvroConsumer({
        'bootstrap.servers': 'broker:29092',
        'group.id': 'anomalie_training',
        'schema.registry.url': 'http://schema-registry:8081'
    })

    # subscribe to topic
    c.subscribe(['anomalie_tutorial'])

    # We need to change the Kafka offset in order to consume from the beginning.
    # There is no straightforward way to achieve this, so a first message has to
    # be polled so that the partition assignment can be obtained. Afterwards the
    # assignment (a list of TopicPartitions) is updated by setting each offset to the beginning.
    msg = c.poll(10)
    topic_partition = c.assignment()

    for partition in topic_partition:
        partition.offset = OFFSET_BEGINNING

    c.assign(topic_partition)

    # Consume messages from the topic
    messages = []
    while True:
        msg = c.poll(1)

        if msg is None:
            break

        messages.append(msg.value())

    c.close()

    # transform messages into a pandas DataFrame and do feature engineering
    df = pd.DataFrame(messages)
    df.timestamp = pd.to_datetime(df.timestamp * 1000000)

    df['hour'] = df.timestamp.dt.hour
    df['business_hour'] = ((df.hour < 8) | (df.hour > 18)).astype("int")
    df.drop(["hour"], axis=1, inplace=True)

    # train test split
    # note that we can not use sklearn.model_selection.train_test_split as this is a time series and random split is not an option!
    train_length = int(len(df) * 0.6)
    x_train = df.drop("timestamp", axis=1).iloc[:train_length, :]
    x_test = df.drop("timestamp", axis=1).iloc[train_length:, :]

    # Train the machine learning model, here an Isolation Forest.
    # contamination is an important parameter: it determines how many data points will be classified as anomalous.
    iso_forest = IsolationForest(n_estimators=100, contamination=float(.02))
    iso_forest.fit(x_train)
    dump(iso_forest, '/data/iso_forest.joblib')

    # make predictions on test set
    predictions = iso_forest.predict(x_test)

    # make plot for evaluation and save figure
    evaluate_anomalies(predictions, df, train_length)
Example #7
def batch_filtering(cityfilter='ALL', mentionfilter='ALL', tagfilter='ALL'):
    if 'username' in request.cookies:
        username = request.cookies['username']
        print(f"Ok, {username}, let's fetch the latest tweets!")
        c = AvroConsumer({
            'bootstrap.servers': BOOTSTRAP_SERVERS,
            'group.id': username,
            'schema.registry.url': SCHEMA_REGISTRY_URL,
            #'isolation.level': 'read_committed'
        })
        c.assign([TopicPartition(TOPIC, 0, 0)])
        low_offset, high_offset = c.get_watermark_offsets(
            TopicPartition(TOPIC, 0))
        #print(f"the latest offset is {high_offset}, the low is {low_offset}")

        # move consumer to offset=high_offset-WINDOW_LEN (only if > 0)
        if high_offset - WINDOW_LEN > 0:
            new_offset = high_offset - WINDOW_LEN
        else:
            new_offset = low_offset
        c.seek(TopicPartition(TOPIC, 0, new_offset))

        msgs = []  # to store the messages to be returned
        pos = c.position([TopicPartition(TOPIC, 0, new_offset)])
        while pos[0].offset < high_offset:
            try:
                msg = c.poll(0)

            except SerializerError as e:
                print("Message deserialization failed for {}: {}".format(
                    msg, e))
                break

            if msg is None:
                continue

            if msg.error():
                print("AvroConsumer error: {}".format(msg.error()))
                continue

            author = msg.value()['author']
            content = msg.value()['content']
            #kafka_timestamp = datetime.datetime.fromtimestamp(float(msg.timestamp()[1]/1000)).strftime('%H:%M:%S, %d-%m-%Y')
            timestamp = datetime.datetime.fromtimestamp(
                float(msg.value()['timestamp'])).strftime('%H:%M:%S, %d-%m-%Y')
            message_ts = float(msg.value()['timestamp'])
            location = msg.value()['location']
            tags = [h[1:] for h in content.split() if h.startswith('#')]
            mentions = [h[1:] for h in content.split() if h.startswith('@')]
            display_message = f"[{author}] {content} ({location} - {timestamp})"
            print(f"[{author}] {content} ({location} - {timestamp})")
            #print(f"consumer position: {c.position([TopicPartition(TOPIC, 0, new_offset)])}")
            pos = c.position([TopicPartition(TOPIC, 0, new_offset)])

            # Apply the city, mention, and tag filters; 'ALL' means no filtering on that field.
            city_ok = cityfilter == 'ALL' or location.lower() == cityfilter
            mention_ok = mentionfilter == 'ALL' or mentionfilter.lower() in mentions
            tag_ok = tagfilter == 'ALL' or tagfilter.lower() in tags
            if city_ok and mention_ok and tag_ok:
                msgs.append((display_message, message_ts))
        c.close()
        # finally return a dictionary of messages
        msgs = list(
            set(msgs)
        )  # this is done to ensure that no duplicates of a message are shown in timeline
        msgs = sorted(msgs, key=lambda x: x[1])
        msgs = [m[0] for m in msgs]
        print(msgs)
        return {"results": msgs}
    else:
        return {"results": ['Oooops, your are not logged in...']}