from confluent_kafka import KafkaError, KafkaException, TopicPartition
from confluent_kafka.avro import AvroConsumer

# BaseWorker, MessageValue, dt_to_unix_ms and the utils module are
# project-local helpers; their imports depend on the surrounding project
# and are noted here rather than guessed.


class KafkaWorker(BaseWorker):
    topic_name = None
    consumer_name = None
    consumer_settings = {}
    commit_on_complete = False
    async_commit = True
    poll_timeout = 0
    auto_offset_reset = 'earliest'
    consumer = None
    last_message = None

    def setup(self):
        # build the consumer and subscribe to the configured topic
        self.consumer = AvroConsumer(self.get_consumer_settings())
        self.consumer.subscribe([self.get_topic_name()])

    def teardown(self):
        if self.consumer:
            self.consumer.close()

    def get_topic_name(self):
        return self.topic_name or utils.config_missing('topic name')

    def get_consumer_name(self):
        return self.consumer_name or utils.generate_random_consumer_name()

    def get_consumer_settings(self):
        default_settings = {
            'group.id': self.get_consumer_name(),
            'default.topic.config': {'auto.offset.reset': self.auto_offset_reset},
            'enable.auto.commit': False,
            'bootstrap.servers': utils.get_broker_url(),
            'schema.registry.url': utils.get_schema_registry_url(),
            'session.timeout.ms': 10000,
            'heartbeat.interval.ms': 1000,
            'api.version.request': True,
        }
        return utils.generate_client_settings(default_settings, self.consumer_settings)

    def poll(self):
        message = self.consumer.poll(timeout=self.poll_timeout)
        if message is not None:
            self.last_message = message
        return message

    def get_partitions(self):
        # an assignment only exists after the first poll, so poll once if empty
        partitions = self.consumer.assignment()
        if not partitions:
            self.poll()
            partitions = self.consumer.assignment()
        return partitions

    def get_current_offsets(self):
        return self.consumer.position(self.get_partitions())

    def reset_consumer_offsets(self, offset):
        self.consumer.assign([TopicPartition(tp.topic, tp.partition, offset)
                              for tp in self.get_partitions()])

    def seek_to_timestamp(self, timestamp):
        # offsets_for_times() expects the target timestamp (ms) in the
        # `offset` field of each TopicPartition
        timestamp_ms = dt_to_unix_ms(timestamp)
        partitions = self.get_partitions()
        for tp in partitions:
            tp.offset = timestamp_ms
        partitions = self.consumer.offsets_for_times(partitions)
        self.consumer.assign(partitions)

    def handle(self):
        message = self.poll()
        if message is None:
            self.wait()
        elif message.error():
            if message.error().code() == KafkaError._PARTITION_EOF:
                self.partition_eof(message)
            else:
                raise KafkaException(message.error())
        else:
            self._consume(message)
            if self.commit_on_complete:
                self.commit()
        self.done()

    def commit(self):
        if not self.consumer_settings.get('enable.auto.commit'):
            self.consumer.commit(asynchronous=self.async_commit)

    def _consume(self, message):
        self.consume_message(MessageValue(message))

    def consume_message(self, message):
        pass

    def partition_eof(self, message):
        pass
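# --- Usage sketch (not in the original source): a minimal KafkaWorker
# subclass. The topic name, worker name and the body of consume_message
# are illustrative assumptions; only the hook names come from the class
# above.
class ProjectEventsWorker(KafkaWorker):
    topic_name = 'project_events'            # hypothetical topic
    consumer_name = 'project_events_worker'  # hypothetical group id
    commit_on_complete = True                # commit after each handled message

    def consume_message(self, message):
        # `message` is the MessageValue wrapper built by _consume()
        print('received:', message)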
from confluent_kafka import KafkaError
from confluent_kafka import TopicPartition
from confluent_kafka.avro import AvroConsumer
from confluent_kafka.avro.serializer import SerializerError

tp = TopicPartition('pure_project_xml', 0, 0)

c = AvroConsumer({
    'bootstrap.servers': 'localhost:9092',
    'group.id': 'pure_project_output_generator',
    'schema.registry.url': 'http://localhost:8081',
})
c.assign([tp])
assignment = c.assignment()

# Need a timeout here due to this bug:
# https://github.com/confluentinc/confluent-kafka-python/issues/196
(first_offset, next_offset_to_create) = c.get_watermark_offsets(tp, timeout=1, cached=False)
last_offset = next_offset_to_create - 1

f = open('pure_project.xml', 'w')
f.write(
    '<?xml version="1.0"?>' + "\n" +
    '<project:upmprojects xmlns:common="v3.commons.pure.atira.dk" xmlns:project="v1.upmproject.pure.atira.dk">' + "\n")

# range values explained: We read the topic backwards, starting with the
# last offset. We use `first_offset - 1` because Python's range will stop
# before it reaches that value. So the last offset used will actually be
# the first offset. The last argument is the step, for which we pass -1,
# because we're reading backwards.
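# --- Hedged continuation (the loop itself is cut off in the source): the
# range arguments follow directly from the comment above; the body --
# seeking to each offset, polling one message, writing its value -- is an
# assumption about what the original script did with each record.
for offset in range(last_offset, first_offset - 1, -1):
    tp.offset = offset
    c.seek(tp)                # jump the consumer to this exact offset
    msg = c.poll(timeout=10)
    if msg is not None and not msg.error():
        f.write(str(msg.value()) + "\n")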
c = AvroConsumer({
    'bootstrap.servers': 'kafka:9092',
    'group.id': 'consumers',
    'client.id': 'pysumer',
    # credentials are masked in the source and left masked here
    'schema.registry.url': 'http://*****:*****@...',
})

# The middle of this snippet (the topic subscription and the poll loop
# that binds `msg` and `ad`) was lost in the source; the loop below is a
# minimal reconstruction implied by the print statement that survives.
c.subscribe(['ads'])  # hypothetical topic name
while True:
    msg = c.poll(1.0)
    if msg is None or msg.error():
        continue
    ad = msg.value()  # decoded Avro record
    # the leading placeholders of the format string were garbled in the
    # source; reconstructed so the arity matches the seven arguments
    print('{} [{}] @{:<4} - {:<6} - id: {:<4} subject: {:<7} price: {:<6}'.format(
        msg.topic(), msg.partition(), msg.offset(), msg.key(),
        ad['id'], ad['subject'], ad['price']))
import pandas as pd
from confluent_kafka import OFFSET_BEGINNING
from confluent_kafka.avro import AvroConsumer
from joblib import dump
from sklearn.ensemble import IsolationForest

# evaluate_anomalies() is a project-local plotting helper assumed to be
# importable alongside this module.


def main():
    # create an Avro consumer for Kafka.
    # Note that only the schema registry URL has to be given here;
    # Avro deserialization is then handled automatically.
    c = AvroConsumer({
        'bootstrap.servers': 'broker:29092',
        'group.id': 'anomalie_training',
        'schema.registry.url': 'http://schema-registry:8081'
    })

    # subscribe to topic
    c.subscribe(['anomalie_tutorial'])

    # We need to change the Kafka offset in order to consume from the
    # beginning. There is no straightforward way to achieve this, so a
    # first message has to be polled so that an assignment can be obtained.
    # Afterwards the assignment (a list of TopicPartition objects) is
    # changed by setting each offset to the beginning.
    msg = c.poll(10)
    topic_partition = c.assignment()
    for partition in topic_partition:
        partition.offset = OFFSET_BEGINNING
    c.assign(topic_partition)

    # consume messages from the topic
    messages = []
    while True:
        msg = c.poll(1)
        if msg is None:
            break
        messages.append(msg.value())
    c.close()

    # transform messages into a pandas DataFrame and do feature engineering
    df = pd.DataFrame(messages)
    df.timestamp = pd.to_datetime(df.timestamp * 1000000)
    df['hour'] = df.timestamp.dt.hour
    df['business_hour'] = ((df.hour < 8) | (df.hour > 18)).astype("int")
    df.drop(["hour"], axis=1, inplace=True)

    # train/test split. Note that we cannot use
    # sklearn.model_selection.train_test_split, as this is a time series
    # and a random split is not an option!
    train_length = int(len(df) * 0.6)
    x_train = df.drop("timestamp", axis=1).iloc[:train_length, :]
    x_test = df.drop("timestamp", axis=1).iloc[train_length:, :]

    # train the machine learning model, here an Isolation Forest.
    # contamination is an important parameter: it determines what share of
    # data points will be classified as anomalous.
    iso_forest = IsolationForest(n_estimators=100, contamination=float(.02))
    iso_forest.fit(x_train)
    dump(iso_forest, '/data/iso_forest.joblib')

    # make predictions on the test set
    predictions = iso_forest.predict(x_test)

    # make a plot for evaluation and save the figure
    evaluate_anomalies(predictions, df, train_length)
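# --- Follow-up sketch (not in the original source): reloading the model
# persisted above for scoring elsewhere, e.g. in a consumer that flags
# anomalies in real time. The single-column frame below is hypothetical;
# real input must have the same feature layout as x_train.
from joblib import load

iso_forest = load('/data/iso_forest.joblib')
new_points = pd.DataFrame({'business_hour': [0, 1]})  # hypothetical rows
print(iso_forest.predict(new_points))  # 1 = normal, -1 = anomalous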