class AvroConsumerFacade:
    def __init__(self, name, emit_datum, broker, schema_registry_url, topic):
        self.name = name
        self.emit_datum = emit_datum
        self.consumer = AvroConsumer({
            'bootstrap.servers': broker,
            'group.id': name,
            'schema.registry.url': schema_registry_url,
            **get_sr_config_from_environment(),
            **get_kafka_config_from_environment(),
        })

        # Subscribe to topics/partitions, and seek to end. Following that we need
        # to poll until the topics have actually been assigned.
        def on_assign(consumer, partitions):
            for p in partitions:
                p.offset = OFFSET_END
            self.consumer.assign(partitions)

        self.consumer.subscribe([topic], on_assign=on_assign)
        self.consumer.poll(10)

    def consume_one(self, poll_wait=0):
        consumed_message = self.consumer.poll(poll_wait)
        if consumed_message is not None:
            self.emit_datum(Datum(good_count=1))
        else:
            self.emit_datum(Datum(bad_count=1))

    def close(self):
        self.consumer.commit()
        self.consumer.close()
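
# Hedged usage sketch (not part of the original source): how the facade above
# might be driven. `Datum`, `get_sr_config_from_environment`, and
# `get_kafka_config_from_environment` come from the surrounding project; the
# broker/registry addresses and topic name below are placeholder assumptions.
def example_run_facade():
    received = []
    facade = AvroConsumerFacade(
        name="example-facade",
        emit_datum=received.append,                    # collect emitted Datum objects
        broker="localhost:9092",                       # assumed broker address
        schema_registry_url="http://localhost:8081",   # assumed registry address
        topic="example-topic",                         # assumed topic name
    )
    try:
        for _ in range(10):
            facade.consume_one(poll_wait=1.0)          # emits one good/bad Datum per poll
    finally:
        facade.close()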
def consume_record(args):
    default_group_name = "default-consumer-group"
    consumer_config = {
        "bootstrap.servers": args.bootstrap_servers,
        "schema.registry.url": args.schema_registry,
        "group.id": default_group_name,
        "auto.offset.reset": "earliest",
    }

    consumer = AvroConsumer(consumer_config)
    consumer.subscribe([args.topic])

    try:
        message = consumer.poll(5)
    except Exception as e:
        print(f"Exception while trying to poll messages - {e}")
    else:
        if message:
            print(
                f"Successfully polled a record from "
                f"Kafka topic: {message.topic()}, partition: {message.partition()}, offset: {message.offset()}\n"
                f"message key: {message.key()} || message value: {message.value()}"
            )
            consumer.commit()
        else:
            print("No new messages at this point. Try again later.")

    consumer.close()
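
# Hedged sketch (assumption, not from the original source): consume_record()
# expects an argparse-style namespace; this is one plausible way to build it.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Consume one Avro record")
    parser.add_argument("--bootstrap-servers", default="localhost:9092")   # assumed default
    parser.add_argument("--schema-registry", default="http://localhost:8081")
    parser.add_argument("--topic", required=True)
    return parser.parse_args()

if __name__ == "__main__":
    consume_record(parse_args())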
class HttpCheckConsumer:
    """Consume Kafka messages"""

    def __init__(self, config: KafkaConfig):
        self.config = config
        self.consumer = AvroConsumer({
            "bootstrap.servers": KAFKA_BROKER,
            "group.id": "groupid",
            "schema.registry.url": KAFKA_SCHEMA_REGISTRY_URL,
            "auto.offset.reset": "smallest",
            "enable.auto.commit": False,
        })
        self.consumer.subscribe([KAFKA_TOPIC])

    @staticmethod
    def _process_message(message) -> HttpCheckResult:
        key = message.key()
        value = message.value()
        timestamp = datetime.datetime.fromisoformat(key["timestamp"])
        status_code = value["status_code"]
        matches_regex = value["matches_regex"]
        response_time_seconds = value["response_time_seconds"]
        return HttpCheckResult(
            status_code=status_code,
            timestamp=timestamp,
            matches_regex=matches_regex,
            response_time_seconds=response_time_seconds,
        )

    def consume(self) -> Generator[HttpCheckResult, None, None]:
        while True:
            try:
                msg = self.consumer.poll(1)
            except SerializerError as error:
                raise HttpCheckSerializerError(
                    f"Message deserialization failed: {error}")
            if msg is None:
                continue
            if msg.error():
                raise HttpCheckConsumerError(
                    "AvroConsumer error: {}".format(msg.error()))
            logger.debug(f"Offset: {msg.offset()}")
            yield self._process_message(msg)

    def commit(self):
        self.consumer.commit()
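
# Hedged usage sketch (not from the original source): draining the consume()
# generator above and committing after each handled result. `KafkaConfig` and
# `HttpCheckResult` are defined elsewhere in the project; handle_result is a
# hypothetical callback.
def run_http_check_consumer(config, handle_result):
    consumer = HttpCheckConsumer(config)
    for result in consumer.consume():   # blocks, yielding HttpCheckResult objects
        handle_result(result)           # e.g. persist the check result somewhere
        consumer.commit()               # manual commit since enable.auto.commit is False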
def printAndProduceMessages(self):
    consumer = AvroConsumer({
        'bootstrap.servers': BOOTSTRAP_SERVERS,
        'group.id': GROUP_ID,
        'auto.offset.reset': AUTO_OFFSET_RESET,
        'enable.auto.commit': False,
        'schema.registry.url': SCHEMA_REGISTRY_URL
    })
    schema_registry = CachedSchemaRegistryClient(
        os.environ.get('SCHEMA_REGISTRY', SCHEMA_REGISTRY_URL))
    avro_serde = AvroSerde(schema_registry)
    consumer.subscribe([INPUT_TOPIC_NAME])

    while True:
        try:
            consumedMessages = consumer.consume(
                num_messages=CONSUMER_BATCH_SIZE, timeout=1)
        except Exception as e:
            logging.error("Message poll failed: {}".format(e))
            break

        messages = []
        for consumedMessage in consumedMessages:
            consumedMessageValue = avro_serde.decode_message(
                consumedMessage.value())
            message = {}
            message["key"] = {}
            message["value"] = {}
            for attr, value in consumedMessageValue.items():
                if attr != ARRAY_NAME:
                    message["value"][DOCUMENT_FIELD_PREFIX + attr] = value
            for arrayItem in consumedMessageValue[ARRAY_NAME]:
                message["key"]["id"] = consumedMessageValue["id"] + \
                    "-" + arrayItem["id"]
                for attr, value in arrayItem.items():
                    message["value"][attr] = value
                messages.append(message)

        self.produceMessages(messages)
        consumer.commit()

    consumer.close()
def consumer(config, topic):
    config = dict(config, **{
        'group.id': 'test_group_35',
    })
    consumer = AvroConsumer(config)

    # Subscribe to topics/partitions, and seek to end. Following that we need
    # to poll until the topics have actually been assigned.
    def on_assign(consumer, partitions):
        for p in partitions:
            p.offset = OFFSET_END
        consumer.assign(partitions)

    consumer.subscribe([topic], on_assign=on_assign)
    consumer.poll(10)

    yield consumer

    consumer.commit()
    consumer.close()
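
# Hedged sketch (assumption): the generator above reads like a pytest fixture
# body; explicit wiring might look like this. `base_config` is a hypothetical
# fixture supplying broker/schema-registry settings, and the topic name is a
# placeholder.
import pytest

@pytest.fixture
def avro_consumer(base_config):
    # Delegate to the generator above: yields the AvroConsumer, then commits
    # and closes it during fixture teardown.
    yield from consumer(base_config, topic="test-topic")

def test_reads_latest_message(avro_consumer):
    msg = avro_consumer.poll(5)
    assert msg is None or not msg.error()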
raw_messages = []
# As we read messages pushed from producer to the consumer - classify:
while True:
    try:
        message = consumer.poll(5)
        print(f"Polled for message: {message}")
    except SerializerError as e:
        # print(f"Exception while trying to poll messages: {e}")
        print("Message deserialization failed for {}: {}".format(message, e))
        # break

    if message is not None:
        print(f"Successfully polled records from KAFKA TOPIC: {TRANSACTIONS_TOPIC}")
        print(f"Message Value: {message.value()}")
        consumer.commit()
        raw_messages.append(message.value())
        print(f"Raw Messages Length: {len(raw_messages)}")

        transaction: dict = message.value()
        topic = FRAUD_TOPIC if is_suspicious(transaction) else LEGIT_TOPIC
        # producer.send(topic, value=transaction)
        print(topic, transaction)  # DEBUG

        # Load to S3 Bucket:
        if len(raw_messages) > 0 and len(raw_messages) % 1000 == 0:
            utc_timestamp = datetime.utcnow().timestamp()
            upload_list_to_s3('kafka-fraud-detector',
                              f'transactions_{utc_timestamp}.json',
                              raw_messages)
            raw_messages = []
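
# Hedged sketch (assumption, not the original helper): one way the
# upload_list_to_s3() call used above could be implemented with boto3. The
# bucket/key layout simply mirrors the call site; everything here is illustrative.
import json
import boto3

def upload_list_to_s3(bucket: str, key: str, records: list) -> None:
    """Serialize the collected message values to JSON and upload them as one S3 object."""
    s3 = boto3.client("s3")
    s3.put_object(Bucket=bucket, Key=key, Body=json.dumps(records).encode("utf-8"))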
class Consumer:
    def __init__(self, broker, schema_registry, topic=None,
                 logging_enabled=False, group_id=None, auto_commit=True):
        """
        Initialiser for a Confluent Consumer using AvroConsumer.
        Each consumer can only be subscribed to one topic.

        Parameters
        ----------
        broker: str
            The URL of the broker (example: 'localhost:9092')
        schema_registry: str
            The URL of the confluent Schema Registry endpoint (example: 'http://localhost:8081')
        topic: str
            The topic to subscribe to
        logging_enabled: bool, Optional
            If True, a logger object is used to log messages
        group_id: str, Optional
            An optional group id which can be used to load balance consumers;
            a random id is generated if none is provided
        auto_commit: bool, Optional
            Whether offsets are committed automatically
        """
        if group_id is None:
            new_hash = hashlib.sha1()
            new_hash.update(str(time.time()).encode("utf-8"))
            group_id = new_hash.hexdigest()

        self.__consumer = AvroConsumer({
            "bootstrap.servers": broker,
            "group.id": group_id,
            "schema.registry.url": schema_registry,
            "enable.auto.commit": auto_commit,
        })
        self.__consumer_non_avro = KafkaConsumer({
            "bootstrap.servers": broker,
            "group.id": group_id + "0",
            "enable.auto.commit": auto_commit,
        })
        self.auto_commit = auto_commit
        if not auto_commit:
            self.consumed_messages = PriorityQueue()
        if topic is not None:
            self.subscribe_to_topic(topic)
        else:
            self.topic = None
        if logging_enabled:
            self.logger = logging.getLogger(__name__)
        else:
            self.logger = None

    def consume(self, timeout=1):
        """
        Consume and return a message if one exists and can be deserialized.

        Returns
        -------
        str
            The received message payload as a string
        None
            No message has been received or an error has occurred
        """
        if self.topic is None:
            raise ValueError("Consumer is currently not subscribed to a topic")

        msg = None
        non_avro = False
        try:
            msg = self.__consumer.poll(timeout)
        except SerializerError:
            try:
                msg = self.__consumer_non_avro.poll(timeout)
                non_avro = True
            except Exception as e:
                self.__log_msg(
                    "Message deserialization has failed {}: {}".format(msg, e),
                    "See the following stack trace",
                    f"{traceback.format_exc()}",
                    delimeter="\n",
                    level="ERROR")
        except RuntimeError:
            self.__log_msg(
                "The consumer has been closed and cannot receive messages",
                level="ERROR")
        except Exception as e:
            self.__log_msg(
                "An unknown error has occurred {}".format(e),
                "See the following stack trace",
                f"{traceback.format_exc()}",
                delimeter="\n",
                level="ERROR")

        if msg is None:
            return None
        if msg.error():
            self.__log_msg("AvroConsumer error: {}".format(msg.error()),
                           level="ERROR")
            return None
        if not self.auto_commit:
            self.consumed_messages.put_nowait(msg)
        if non_avro:
            return json.loads(msg.value().decode())
        return msg.value()

    def __enter__(self):
        return self.__consumer

    def __exit__(self, *args):
        self.close()

    def __log_msg(self, *messages, level="NOTSET", delimeter=" "):
        levels = {
            "CRITICAL": logging.CRITICAL,
            "ERROR": logging.ERROR,
            "WARNING": logging.WARNING,
            "INFO": logging.INFO,
            "DEBUG": logging.DEBUG,
            "NOTSET": logging.NOTSET,
        }
        msg = delimeter.join(messages)
        if self.logger is not None:
            if level not in levels:
                raise ValueError(
                    f"level {level} is not valid must be one of {list(levels.keys())}")
            self.logger.log(levels[level], msg)
        else:
            if level is not None:
                print(f"LOGGED MESSAGE: {msg}")
            else:
                print(f"{level}: {msg}")

    def commit(self, asynchronous=True):
        if not self.auto_commit and not self.consumed_messages.empty():
            msg = self.consumed_messages.get_nowait()
            self.__consumer.commit(msg, asynchronous=asynchronous)

    def list_topics(self, topic=None, timeout=1):
        try:
            metadata = self.__consumer.list_topics(topic, timeout)
            topics = metadata.topics
            return list(topics.keys())
        except Exception as e:
            self.__log_msg(
                f"An unknown error has occurred when trying to list topics {e}",
                level="ERROR")
            self.logger.debug(e)

    def check_if_topic_exists(self, topic, timeout=1):
        topic_list = self.list_topics(timeout=timeout)
        if topic_list is not None:
            return topic in topic_list

    def subscribe_to_topic(self, topic):
        try:
            self.__consumer_non_avro.subscribe([topic], on_assign=self.__assign)
            self.__consumer.subscribe([topic], on_assign=self.__assign)
            self.topic = topic
            return True
        except Exception as e:
            self.__log_msg(
                "An unknown error {}".format(e),
                "occurred while trying to subscribe to topic {}".format(topic),
                delimeter=" ",
                level="ERROR")
            return False

    def __assign(self, consumer, partitions):
        for p in partitions:
            p.offset = consumer.get_watermark_offsets(p)[1] - 1
        self.__consumer.assign(partitions)
        self.__consumer_non_avro.assign(partitions)

    def close(self):
        """
        Close the consumer. Once called this object cannot be reused.
        """
        self.__consumer.close()
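
# Hedged usage sketch (not part of the original class): typical wiring of the
# Consumer wrapper above. The broker/registry addresses and the topic name are
# placeholder assumptions.
def drain_topic():
    consumer = Consumer(
        broker="localhost:9092",
        schema_registry="http://localhost:8081",
        topic="example-topic",
        auto_commit=False,
    )
    try:
        for _ in range(100):
            payload = consumer.consume(timeout=1)
            if payload is None:
                continue
            print(payload)
            consumer.commit(asynchronous=False)   # commit the oldest uncommitted message
    finally:
        consumer.close()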
class KafkaWorker(BaseWorker):
    topic_name = None
    consumer_name = None
    consumer_settings = {}
    commit_on_complete = False
    async_commit = True
    poll_timeout = 0
    auto_offset_reset = 'earliest'
    consumer = None
    last_message = None

    def setup(self):
        self.consumer = AvroConsumer(self.get_consumer_settings())
        self.consumer.subscribe([self.get_topic_name()])

    def teardown(self):
        if self.consumer:
            self.consumer.close()

    def get_topic_name(self):
        return self.topic_name or utils.config_missing('topic name')

    def get_consumer_name(self):
        return self.consumer_name or utils.generate_random_consumer_name()

    def get_consumer_settings(self):
        default_settings = {
            'group.id': self.get_consumer_name(),
            'default.topic.config': {'auto.offset.reset': self.auto_offset_reset},
            'enable.auto.commit': False,
            'bootstrap.servers': utils.get_broker_url(),
            'schema.registry.url': utils.get_schema_registry_url(),
            'session.timeout.ms': 10000,
            'heartbeat.interval.ms': 1000,
            'api.version.request': True,
        }
        return utils.generate_client_settings(default_settings, self.consumer_settings)

    def poll(self):
        message = self.consumer.poll(timeout=self.poll_timeout)
        if message is not None:
            self.last_message = message
        return message

    def get_partitions(self):
        partitions = self.consumer.assignment()
        if not partitions:
            self.poll()
            partitions = self.consumer.assignment()
        return partitions

    def get_current_offsets(self):
        return self.consumer.position(self.get_partitions())

    def reset_consumer_offsets(self, offset):
        self.consumer.assign([TopicPartition(tp.topic, tp.partition, offset)
                              for tp in self.get_partitions()])

    def seek_to_timestamp(self, timestamp):
        timestamp_ms = dt_to_unix_ms(timestamp)
        partitions = self.get_partitions()
        for tp in partitions:
            tp.offset = timestamp_ms
        partitions = self.consumer.offsets_for_times(partitions)
        self.consumer.assign(partitions)

    def handle(self):
        message = self.poll()
        if message is None:
            self.wait()
        elif message.error():
            if message.error().code() == KafkaError._PARTITION_EOF:
                self.partition_eof(message)
            else:
                raise KafkaException(message.error())
        else:
            self._consume(message)
            if self.commit_on_complete:
                self.commit()
        self.done()

    def commit(self):
        if not self.consumer_settings.get('enable.auto.commit'):
            self.consumer.commit(asynchronous=self.async_commit)

    def _consume(self, message):
        self.consume_message(MessageValue(message))

    def consume_message(self, message):
        pass

    def partition_eof(self, message):
        pass
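
# Hedged sketch (assumption): how the worker base class above is meant to be
# specialised. `BaseWorker`, `MessageValue`, and `utils` come from the
# surrounding project; the topic name and handler body are illustrative only.
class OrderWorker(KafkaWorker):
    topic_name = "orders"          # assumed topic name
    commit_on_complete = True      # commit after each successfully handled message

    def consume_message(self, message):
        # `message` is the MessageValue wrapper around the decoded Avro record
        print("received order:", message)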
# Fragment: the enclosing polling loop, the `try:` statement, and the first
# `elif msg_value['source']['table'] == ...` branch are not shown; `msg` is the
# polled message and `msg_value` its decoded value.
        if msg_value['op'] in ['c', 'u']:
            upsert_art(msg_value)
        elif msg_value['op'] == 'd':
            delete_art(msg_value)
    elif msg_value['source']['table'] == 'artizen':
        if msg_value['op'] in ['c', 'u']:
            upsert_artizen(msg_value)
        elif msg_value['op'] == 'd':
            delete_artizen(msg_value)
    elif msg_value['source']['table'] == 'archive':
        if msg_value['op'] in ['c', 'u', 'd']:
            update_relation(msg_value)
    elif msg_value['source']['table'] == 'text':
        if msg_value['op'] in ['c', 'u', 'd']:
            update_introduction(msg_value)
    c.commit(message=msg)
except (TypeError, KeyError, json.decoder.JSONDecodeError) as e:
    print('Invalid message format: {}: {}'.format(msg_value, e), flush=True)
except MySQLdb.Error as e:
    print('Error in MySQL operation: {}: {}'.format(msg_value, e), flush=True)
except ElasticsearchException as e:
    print('Error in sending request to ElasticSearch: {}: {}'.format(
        msg_value, e), flush=True)
except Exception as e:
    print('Uncaught exception: {}: {}'.format(msg_value, e), flush=True)
    c.close()
    raise e
c = AvroConsumer({
    'bootstrap.servers': '192.168.25.163:19092',
    'group.id': 'cgroudid-4',
    'schema.registry.url': 'http://192.168.25.163:7070',
    "api.version.request": True
})
c.subscribe(['job_entity'])

running = True
while running:
    msg = None
    try:
        msg = c.poll(10)
        print(msg)
        if msg:
            if not msg.error():
                print(msg.value())
                print(msg.key())
                print(msg.partition())
                print(msg.offset())
                c.commit(msg)
            elif msg.error().code() != KafkaError._PARTITION_EOF:
                print(msg.error())
                running = False
        else:
            print("No Message!! Happily trying again!!")
    except SerializerError as e:
        print("Message deserialization failed for %s: %s" % (msg, e))
        running = False

c.commit()
c.close()
class AvroConsumerApi(ABCMbApi):
    """
    This class implements the interface for a Kafka consumer carrying Avro messages.
    It is expected that users extend this class and override the handle_message function.
    """
    def __init__(self, *, consumer_conf: dict, key_schema_location,
                 value_schema_location: str, topics: List[str], batch_size: int = 5,
                 logger: logging.Logger = None, sync: bool = False):
        super(AvroConsumerApi, self).__init__(logger=logger)

        self.key_schema = self.load_schema(schema_file=key_schema_location)
        self.value_schema = self.load_schema(schema_file=value_schema_location)

        self.consumer = AvroConsumer(consumer_conf,
                                     reader_key_schema=self.key_schema,
                                     reader_value_schema=self.value_schema)
        self.running = True
        self.topics = topics
        self.batch_size = batch_size
        self.sync = sync

    def shutdown(self):
        """
        Shutdown the consumer
        :return:
        """
        self.logger.debug("Trigger shutdown")
        self.running = False

    @staticmethod
    def _create_instance(*, module_name: str, class_name: str):
        module = importlib.import_module(module_name)
        class_ = getattr(module, class_name)
        return class_()

    @staticmethod
    def _create_instance_with_params(*, module_name: str, class_name: str):
        module = importlib.import_module(module_name)
        class_ = getattr(module, class_name)
        return class_

    def process_message(self, topic: str, key: dict, value: dict):
        """
        Process the incoming message. Must be overridden in the derived class
        :param topic: topic name
        :param key: incoming message key
        :param value: incoming message value
        :return:
        """
        self.logger.debug("KAFKA: Message received for topic " + topic)
        self.logger.debug("KAFKA: Key = {}".format(key))
        self.logger.debug("KAFKA: Value = {}".format(value))

        class_name = value.get('name', None) + 'Avro'
        self.logger.debug("KAFKA: class_name = {}".format(class_name))

        module_name = f"fabric_mb.message_bus.messages.{re.sub(r'(?<!^)(?=[A-Z])', '_', class_name).lower()}"
        self.logger.debug(f"KAFKA: module_name = {module_name}")

        message = self._create_instance(module_name=module_name, class_name=class_name)
        message.from_dict(value)

        self.handle_message(message=message)

    def handle_message(self, message: AbcMessageAvro):
        """
        Handle incoming message; must be overridden by the derived class
        :param message: incoming message
        """
        print(message)

    def consume(self):
        """
        Consume records unless shutdown triggered. Using synchronous commit after a message batch.
        """
        self.consumer.subscribe(self.topics)

        msg_count = 0
        while self.running:
            try:
                msg = self.consumer.poll(1)

                # There were no messages on the queue, continue polling
                if msg is None:
                    continue

                if msg.error():
                    self.logger.error(f"KAFKA: Consumer error: {msg.error()}")
                    if msg.error().code() == KafkaError._PARTITION_EOF:
                        # End of partition event
                        self.logger.error(
                            f"KAFKA: {msg.topic()} {msg.partition()} reached end at offset [{msg.offset()}]")
                    elif msg.error():
                        self.logger.error(f"KAFKA: Consumer error: {msg.error()}")
                    continue

                self.process_message(msg.topic(), msg.key(), msg.value())
                if self.sync:
                    msg_count += 1
                    if msg_count % self.batch_size == 0:
                        self.consumer.commit(asynchronous=False)
            except SerializerError as e:
                # Report malformed record, discard results, continue polling
                self.logger.error(f"KAFKA: Message deserialization failed {e}")
                continue
            except KeyboardInterrupt:
                break

        self.logger.debug("KAFKA: Shutting down consumer..")
        self.consumer.close()
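
# Hedged sketch (assumption, not from the original source): a minimal
# specialisation of AvroConsumerApi. The schema paths, topic, broker, and
# registry addresses below are placeholders; handle_message just logs the
# decoded message object.
class ExampleConsumer(AvroConsumerApi):
    def handle_message(self, message: AbcMessageAvro):
        self.logger.info(f"Handled message of type {type(message).__name__}")

consumer = ExampleConsumer(
    consumer_conf={
        "bootstrap.servers": "localhost:9092",            # assumed broker
        "schema.registry.url": "http://localhost:8081",   # assumed registry
        "group.id": "example-group",
        "auto.offset.reset": "earliest",
    },
    key_schema_location="schema/key.avsc",                # assumed path
    value_schema_location="schema/message.avsc",          # assumed path
    topics=["example-topic"],
    sync=True,
)
consumer.consume()   # blocks until shutdown() is called or KeyboardInterrupt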