Example #1
class KafkaWorker(BaseWorker):
    topic_name = None
    consumer_name = None
    consumer_settings = {}
    commit_on_complete = False
    async_commit = True
    poll_timeout = 0
    auto_offset_reset = 'earliest'
    consumer = None
    last_message = None

    def setup(self):
        self.consumer = AvroConsumer(self.get_consumer_settings())
        self.consumer.subscribe([self.get_topic_name()])

    def teardown(self):
        if self.consumer:
            self.consumer.close()

    def get_topic_name(self):
        return self.topic_name or utils.config_missing('topic name')

    def get_consumer_name(self):
        return self.consumer_name or utils.generate_random_consumer_name()

    def get_consumer_settings(self):
        default_settings = {
            'group.id': self.get_consumer_name(),
            'default.topic.config': {'auto.offset.reset': self.auto_offset_reset},
            'enable.auto.commit': False,
            'bootstrap.servers': utils.get_broker_url(),
            'schema.registry.url': utils.get_schema_registry_url(),
            'session.timeout.ms': 10000,
            'heartbeat.interval.ms': 1000,
            'api.version.request': True,
        }
        return utils.generate_client_settings(default_settings, self.consumer_settings)

    def poll(self):
        message = self.consumer.poll(timeout=self.poll_timeout)
        if message is not None:
            self.last_message = message
        return message

    def get_partitions(self):
        partitions = self.consumer.assignment()
        if not partitions:
            self.poll()
            partitions = self.consumer.assignment()
        return partitions

    def get_current_offsets(self):
        return self.consumer.position(self.get_partitions())

    def reset_consumer_offsets(self, offset):
        self.consumer.assign([TopicPartition(tp.topic, tp.partition, offset)
                              for tp in self.get_partitions()])

    def seek_to_timestamp(self, timestamp):
        timestamp_ms = dt_to_unix_ms(timestamp)
        partitions = self.get_partitions()
        for tp in partitions:
            tp.offset = timestamp_ms
        partitions = self.consumer.offsets_for_times(partitions)
        self.consumer.assign(partitions)

    def handle(self):
        message = self.poll()

        if message is None:
            self.wait()

        elif message.error():
            if message.error().code() == KafkaError._PARTITION_EOF:
                self.partition_eof(message)

            else:
                raise KafkaException(message.error())

        else:
            self._consume(message)

            if self.commit_on_complete:
                self.commit()

        self.done()

    def commit(self):
        if not self.consumer_settings.get('enable.auto.commit'):
            self.consumer.commit(asynchronous=self.async_commit)

    def _consume(self, message):
        self.consume_message(MessageValue(message))

    def consume_message(self, message):
        pass

    def partition_eof(self, message):
        pass
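
A minimal sketch of how such a worker might be used. The subclass, topic and consumer names are hypothetical, and BaseWorker, MessageValue and the utils helpers are assumed to come from the surrounding project:

class ExampleEventsWorker(KafkaWorker):
    topic_name = 'example_events'            # hypothetical topic
    consumer_name = 'example_events_worker'  # hypothetical consumer group
    commit_on_complete = True

    def consume_message(self, message):
        # `message` is the MessageValue wrapper around the decoded Avro record
        print(message)


# BaseWorker is assumed to drive the lifecycle: setup(), then handle() in a loop, then teardown().
worker = ExampleEventsWorker()
worker.setup()
worker.handle()
worker.teardown()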
Example #2
from confluent_kafka import KafkaError
from confluent_kafka import TopicPartition
from confluent_kafka.avro import AvroConsumer
from confluent_kafka.avro.serializer import SerializerError

tp = TopicPartition('pure_project_xml', 0, 0)
c = AvroConsumer({
    'bootstrap.servers': 'localhost:9092',
    'group.id': 'pure_project_output_generator',
    'schema.registry.url': 'http://localhost:8081',
})
c.assign([tp])
assignment = c.assignment()

# Need a timeout here due to this bug: https://github.com/confluentinc/confluent-kafka-python/issues/196
(first_offset, next_offset_to_create) = c.get_watermark_offsets(tp,
                                                                timeout=1,
                                                                cached=False)
last_offset = next_offset_to_create - 1

f = open('pure_project.xml', 'w')
f.write(
    '<?xml version="1.0"?>' + "\n" +
    '<project:upmprojects xmlns:common="v3.commons.pure.atira.dk" xmlns:project="v1.upmproject.pure.atira.dk">'
    + "\n")

# range values explained: We read the topic backwards, starting with the
# last offset. We use `first_offset - 1` because Python's range will stop
# before it reaches that value. So the last offset used will actually be
# the first offset. The last argument is the step, for which we pass -1,
# because we're reading backwards.
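
The loop that the comment above describes is not shown in this listing. A minimal sketch of what it might look like, built from the variables set up above, follows; reading the topic record by record via per-offset assignment, and the field written from each Avro record, are assumptions:

for offset in range(last_offset, first_offset - 1, -1):
    # point the consumer at exactly this offset and read the single message stored there
    c.assign([TopicPartition('pure_project_xml', 0, offset)])
    msg = c.poll(timeout=10)
    if msg is None or msg.error():
        continue
    record = msg.value()            # decoded Avro record (a dict)
    f.write(str(record) + "\n")     # the original presumably writes an XML fragment from the record

f.write('</project:upmprojects>' + "\n")
f.close()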
Example #3
c = AvroConsumer({
    'bootstrap.servers': 'kafka:9092',
    'group.id': 'consumers',
    'client.id': 'pysumer',
    'schema.registry.url': 'http://*****:*****',  # host and credentials redacted in the original
})
# The remainder of this example was lost in extraction; only a fragment of a formatted print
# of msg.topic(), msg.partition(), msg.offset(), msg.key() and the record's 'id', 'subject'
# and 'price' fields survives.
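
Only the consumer configuration and that print fragment survive from this example. A minimal poll loop consistent with the fragment might look like the following; it is a sketch rather than the original code, and the topic name 'ads' and the leading fields of the format string are assumptions:

from confluent_kafka.avro.serializer import SerializerError

c.subscribe(['ads'])  # hypothetical topic name

while True:
    try:
        msg = c.poll(1)
    except SerializerError as e:
        print('Message deserialization failed: {}'.format(e))
        break
    if msg is None:
        continue
    if msg.error():
        print('Consumer error: {}'.format(msg.error()))
        continue

    ad = msg.value()  # decoded Avro record, assumed to carry 'id', 'subject' and 'price'
    print('{} [{}] @{:<4} - {:<6} - id: {:<4} subject: {:<7} price: {:<6}'
          .format(msg.topic(), msg.partition(), msg.offset(), msg.key(),
                  ad['id'], ad['subject'], ad['price']))

c.close()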
Example #5
from confluent_kafka import OFFSET_BEGINNING
from confluent_kafka.avro import AvroConsumer
from joblib import dump
from sklearn.ensemble import IsolationForest
import pandas as pd

# evaluate_anomalies (used at the end of main) is assumed to be defined elsewhere in the project


def main():

    # create Avro Consumer for Kafka
    # Note that only the schema registry URL has to be given here; Avro deserialization is handled automatically
    c = AvroConsumer({
        'bootstrap.servers': 'broker:29092',
        'group.id': 'anomalie_training',
        'schema.registry.url': 'http://schema-registry:8081'
    })

    # subscribe to topic
    c.subscribe(['anomalie_tutorial'])

    # We need to rewind the Kafka offsets in order to consume from the beginning.
    # There is no straightforward way to achieve this, so a first message has to be
    # polled so that the partition assignment can be obtained. Afterwards the assignment
    # (a list of TopicPartition objects) is changed by setting each offset to the
    # beginning (an alternative using an on_assign callback is sketched after this example).
    msg = c.poll(10)
    topic_partition = c.assignment()

    for partition in topic_partition:
        partition.offset = OFFSET_BEGINNING

    c.assign(topic_partition)

    # Consume messages from the topic
    messages = []
    while True:
        msg = c.poll(1)

        if msg is None:
            break

        messages.append(msg.value())

    c.close()

    # transform messages to Pandas DataFrame and feature engineering
    df = pd.DataFrame(messages)
    df.timestamp = pd.to_datetime(df.timestamp * 1000000)

    df['hour'] = df.timestamp.dt.hour
    df['business_hour'] = ((df.hour < 8) | (df.hour > 18)).astype("int")
    df.drop(["hour"], axis=1, inplace=True)

    # train test split
    # note that we cannot use sklearn.model_selection.train_test_split, as this is a time series and a random split is not an option!
    train_length = int(len(df) * 0.6)
    x_train = df.drop("timestamp", axis=1).iloc[:train_length, :]
    x_test = df.drop("timestamp", axis=1).iloc[train_length:, :]

    # Train Machine Learning Model, here Isolation Forests
    # contamination is an important parameter: it determines how many datapoints will be classified as anomalous.
    iso_forest = IsolationForest(n_estimators=100, contamination=float(.02))
    iso_forest.fit(x_train)
    dump(iso_forest, '/data/iso_forest.joblib')

    # make predictions on test set
    predictions = iso_forest.predict(x_test)

    # make plot for evaluation and save figure
    evaluate_anomalies(predictions, df, train_length)
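
As an aside to the offset-rewinding workaround above: confluent_kafka also accepts an on_assign callback on subscribe(), which rewinds the partitions as soon as they are assigned and avoids the initial dummy poll. A minimal sketch, reusing this example's broker and topic but with an otherwise illustrative configuration:

from confluent_kafka import OFFSET_BEGINNING
from confluent_kafka.avro import AvroConsumer


def rewind_to_beginning(consumer, partitions):
    # called by the consumer once partitions have been assigned
    for p in partitions:
        p.offset = OFFSET_BEGINNING
    consumer.assign(partitions)


c = AvroConsumer({
    'bootstrap.servers': 'broker:29092',
    'group.id': 'anomalie_training_rewind',   # hypothetical group id
    'schema.registry.url': 'http://schema-registry:8081',
})
c.subscribe(['anomalie_tutorial'], on_assign=rewind_to_beginning)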