Example #1
class AvroConsumerFacade:
    def __init__(self, name, emit_datum, broker, schema_registry_url, topic):
        self.name = name
        self.emit_datum = emit_datum
        self.consumer = AvroConsumer({
            'bootstrap.servers': broker,
            'group.id': name,
            'schema.registry.url': schema_registry_url,
            **get_sr_config_from_environment(),
            **get_kafka_config_from_environment(),
        })

        # Subscribe to topics/partitions, and seek to end. Following that we need
        # to poll until the topics have actually been assigned.
        def on_assign(consumer, partitions):
            for p in partitions:
                p.offset = OFFSET_END
            self.consumer.assign(partitions)

        self.consumer.subscribe([topic], on_assign=on_assign)
        self.consumer.poll(10)

    def consume_one(self, poll_wait=0):
        consumed_message = self.consumer.poll(poll_wait)
        if consumed_message is not None:
            self.emit_datum(Datum(good_count=1))
        else:
            self.emit_datum(Datum(bad_count=1))

    def close(self):
        self.consumer.commit()
        self.consumer.close()
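
A short usage sketch for the facade above; the broker and schema-registry addresses, the topic name, and the emit_datum callback are assumptions inferred only from the constructor signature.

# Hypothetical usage of AvroConsumerFacade; addresses, topic and the emit_datum
# callback are placeholders inferred from the constructor signature above.
def emit_datum(datum):
    print(datum)

facade = AvroConsumerFacade(
    name="example-group",
    emit_datum=emit_datum,
    broker="localhost:9092",
    schema_registry_url="http://localhost:8081",
    topic="example-topic",
)
facade.consume_one(poll_wait=5)
facade.close()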
Example #2
def consume_record(args):
    default_group_name = "default-consumer-group"

    consumer_config = {
        "bootstrap.servers": args.bootstrap_servers,
        "schema.registry.url": args.schema_registry,
        "group.id": default_group_name,
        "auto.offset.reset": "earliest"
    }

    consumer = AvroConsumer(consumer_config)

    consumer.subscribe([args.topic])

    try:
        message = consumer.poll(5)
    except Exception as e:
        print(f"Exception while trying to poll messages - {e}")
    else:
        if message:
            print(
                f"Successfully polled a record from "
                f"Kafka topic: {message.topic()}, partition: {message.partition()}, offset: {message.offset()}\n"
                f"message key: {message.key()} || message value: {message.value()}"
            )
            consumer.commit()
        else:
            print("No new messages at this point. Try again later.")

    consumer.close()
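
A minimal sketch of how the args namespace used above might be built; the flag names and default values are assumptions, only the attribute names (bootstrap_servers, schema_registry, topic) come from the example.

# Hypothetical CLI wiring for consume_record(); flag names and defaults are
# illustrative, only the attribute names match the example above.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Consume one Avro record")
    parser.add_argument("--bootstrap-servers", dest="bootstrap_servers",
                        default="localhost:9092")
    parser.add_argument("--schema-registry", dest="schema_registry",
                        default="http://localhost:8081")
    parser.add_argument("--topic", required=True)
    return parser.parse_args()

if __name__ == "__main__":
    consume_record(parse_args())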
Example #3
class HttpCheckConsumer:
    """Consume Kafka messages"""
    def __init__(self, config: KafkaConfig):
        self.config = config
        self.consumer = AvroConsumer({
            "bootstrap.servers": KAFKA_BROKER,
            "group.id": "groupid",
            "schema.registry.url": KAFKA_SCHEMA_REGISTRY_URL,
            "auto.offset.reset": "smallest",
            "enable.auto.commit": False,
        })
        self.consumer.subscribe([KAFKA_TOPIC])

    @staticmethod
    def _process_message(message) -> HttpCheckResult:
        key = message.key()
        value = message.value()
        timestamp = datetime.datetime.fromisoformat(key["timestamp"])
        status_code = value["status_code"]
        matches_regex = value["matches_regex"]
        response_time_seconds = value["response_time_seconds"]
        return HttpCheckResult(
            status_code=status_code,
            timestamp=timestamp,
            matches_regex=matches_regex,
            response_time_seconds=response_time_seconds,
        )

    def consume(self) -> Generator[HttpCheckResult, None, None]:
        while True:
            try:
                msg = self.consumer.poll(1)
            except SerializerError as error:
                raise HttpCheckSerializerError(
                    f"Message deserialization failed: {error}")

            if msg is None:
                continue

            if msg.error():
                raise HttpCheckConsumerError("AvroConsumer error: {}".format(
                    msg.error()))

            logger.debug(f"Offset: {msg.offset()}")
            yield self._process_message(msg)

    def commit(self):
        self.consumer.commit()
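
A possible consumption loop for the generator above; how KafkaConfig is constructed is an assumption, only the consume() and commit() API comes from the class.

# Hypothetical driver loop; KafkaConfig construction is assumed here, only the
# consume()/commit() API comes from HttpCheckConsumer above.
checker = HttpCheckConsumer(config=KafkaConfig())
for result in checker.consume():
    print(result)
    checker.commit()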
Example #4
    def printAndProduceMessages(self):
        consumer = AvroConsumer({
            'bootstrap.servers': BOOTSTRAP_SERVERS,
            'group.id': GROUP_ID,
            'auto.offset.reset': AUTO_OFFSET_RESET,
            'enable.auto.commit': False,
            'schema.registry.url': SCHEMA_REGISTRY_URL
        })
        schema_registry = CachedSchemaRegistryClient(
            os.environ.get('SCHEMA_REGISTRY', SCHEMA_REGISTRY_URL))
        avro_serde = AvroSerde(schema_registry)

        consumer.subscribe([INPUT_TOPIC_NAME])

        while True:
            try:
                consumedMessages = consumer.consume(
                    num_messages=CONSUMER_BATCH_SIZE, timeout=1)
            except Exception as e:
                logging.error("Message poll failed: {}".format(e))
                break

            messages = []
            for consumedMessage in consumedMessages:
                consumedMessageValue = avro_serde.decode_message(
                    consumedMessage.value())

                # Collect the non-array attributes once per consumed message.
                commonValues = {}
                for attr, value in consumedMessageValue.items():
                    if attr != ARRAY_NAME:
                        commonValues[DOCUMENT_FIELD_PREFIX + attr] = value

                for arrayItem in consumedMessageValue[ARRAY_NAME]:
                    # Build a fresh dict per array item; reusing a single dict
                    # would make every appended message alias the last item.
                    message = {"key": {}, "value": dict(commonValues)}
                    message["key"]["id"] = consumedMessageValue["id"] + \
                        "-" + arrayItem["id"]
                    for attr, value in arrayItem.items():
                        message["value"][attr] = value
                    messages.append(message)

            self.produceMessages(messages)
            consumer.commit()
        consumer.close()
Example #5
def consumer(config, topic):
    config = dict(config, **{
        'group.id': 'test_group_35',
    })
    consumer = AvroConsumer(config)

    # Subscribe to topics/partitions, and seek to end. Following that we need
    # to poll until the topics have actually been assigned.
    def on_assign(consumer, partitions):
        for p in partitions:
            p.offset = OFFSET_END
        consumer.assign(partitions)

    consumer.subscribe([topic], on_assign=on_assign)
    consumer.poll(10)

    yield consumer

    consumer.commit()
    consumer.close()
Example #6
    
    raw_messages = []

    # As we read messages pushed from producer to the consumer - classify: 
    while True:
        try:
            message = consumer.poll(5)
            print(f"Polled for message: {message}")
        except SerializerError as e:
            print(f"Message deserialization failed: {e}")
            continue

        if message is not None:
            print(f"Successfully polled records from KAFKA TOPIC: {TRANSACTIONS_TOPIC}")
            print(f"Message Value: {message.value()}")
            consumer.commit()
            raw_messages.append(message.value())

            print(f"Raw Messages Length: {len(raw_messages)}")

            transaction: dict = message.value()
            topic = FRAUD_TOPIC if is_suspicious(transaction) else LEGIT_TOPIC
            # producer.send(topic, value=transaction)
            
            print(topic, transaction)  # DEBUG
            
            # Load to S3 Bucket:
            if len(raw_messages) > 0 and len(raw_messages) % 1000 == 0:
                utc_timestamp = datetime.utcnow().timestamp()
                upload_list_to_s3('kafka-fraud-detector', f'transactions_{utc_timestamp}.json', raw_messages)
                raw_messages = []
Example #7
class Consumer:
    def __init__(self,
                 broker,
                 schema_registry,
                 topic=None,
                 logging_enabled=False,
                 group_id=None,
                 auto_commit=True):
        """
        Initialiser for Confluent Consumer using AvroConsumer.
        Each consumer can only be subscribed to one topic.
        Parameters
        ----------
        broker: str
            The URL of the broker (example: 'localhost:9092')
        schema_registry: str
            The URL of the Confluent Schema Registry endpoint (example: 'http://localhost:8081')
        topic: str, Optional
            The topic to subscribe to
        logging_enabled: bool, Optional
            If True, messages are logged with the module logger; otherwise they are printed
        group_id: str, Optional
            An optional group id which can be used to load balance consumers;
            a random id is generated if none is supplied
        auto_commit: bool, Optional
            Whether offsets are committed automatically (default True)
        """
        if group_id is None:
            new_hash = hashlib.sha1()
            new_hash.update(str(time.time()).encode("utf-8"))
            group_id = new_hash.hexdigest()

        self.__consumer = AvroConsumer({
            "bootstrap.servers": broker,
            "group.id": group_id,
            "schema.registry.url": schema_registry,
            "enable.auto.commit": auto_commit
        })
        self.__consumer_non_avro = KafkaConsumer({
            "bootstrap.servers": broker,
            "group.id": group_id + "0",
            "enable.auto.commit": auto_commit
        })
        self.auto_commit = auto_commit
        if not auto_commit:
            self.consumed_messages = PriorityQueue()
        if topic is not None:
            self.subscribe_to_topic(topic)
        else:
            self.topic = None
        if logging_enabled:
            self.logger = logging.getLogger(__name__)
        else:
            self.logger = None

    def consume(self, timeout=1):
        """
        Method to consume and return a message if one exists and can be deserialized
        Returns
        -------
        str
            The received message payload as a string
        None
            No message has been received or an error has occurred
        """
        if self.topic is not None:
            msg = None
            non_avro = False
            try:
                msg = self.__consumer.poll(timeout)
            except SerializerError as e:
                try:
                    msg = self.__consumer_non_avro.poll(timeout)
                    non_avro = True
                except Exception as e:
                    self.__log_msg(
                        "Message deserialization failed for {}: {}".format(
                            msg, e),
                        "See the following stack trace",
                        f"{traceback.format_exc()}",
                        delimeter="\n",
                        level="ERROR")
            except RuntimeError as e:
                self.__log_msg(
                    "The consumer has been closed and cannot receive messages",
                    level="ERROR")
            except Exception as e:
                self.__log_msg("An unknown error has occurred {}".format(e),
                               "See the following stack trace",
                               f"{traceback.format_exc()}",
                               delimeter="\n",
                               level="ERROR")

            if msg is not None:
                if msg.error():
                    self.__log_msg("AvroConsumer error: {}".format(
                        msg.error()),
                                   level="ERROR")
                else:
                    if not self.auto_commit:
                        self.consumed_messages.put_nowait(msg)
                    if non_avro:
                        data_to_be_returned = json.loads(msg.value().decode())
                    else:
                        data_to_be_returned = msg.value()
                    return data_to_be_returned
        else:
            raise ValueError("Consumer is currently not subscribed to a topic")

    def __enter__(self):
        return self.__consumer

    def __exit__(self, *args):
        self.close()

    def __log_msg(
        self,
        *messages,
        level="NOTSET",
        delimeter=" ",
    ):
        levels = {
            "CRITICAL": logging.CRITICAL,
            "ERROR": logging.ERROR,
            "WARNING": logging.WARNING,
            "INFO": logging.INFO,
            "DEBUG": logging.DEBUG,
            "NOTSET": logging.NOTSET
        }
        msg = delimeter.join(messages)
        if self.logger is not None:
            if level not in levels:
                raise ValueError(
                    f"level {level} is not valid must be one of {list(levels.keys())}"
                )
            self.logger.log(levels[level], msg)
        else:
            if level == "NOTSET":
                print(f"LOGGED MESSAGE: {msg}")
            else:
                print(f"{level}: {msg}")

    def commit(self, asynchronous=True):
        if not self.auto_commit and not self.consumed_messages.empty():
            msg = self.consumed_messages.get_nowait()
            self.__consumer.commit(msg, asynchronous=asynchronous)

    def list_topics(self, topic=None, timeout=1):
        try:
            metadata = self.__consumer.list_topics(topic, timeout)
            topics = metadata.topics
            return list(topics.keys())
        except Exception as e:
            self.__log_msg(
                f"An unknown error has occurred when trying to list topics: {e}",
                level="ERROR")
            if self.logger is not None:
                self.logger.debug(e)

    def check_if_topic_exists(self, topic, timeout=1):
        topic_list = self.list_topics(timeout=timeout)
        if topic_list is not None:
            return topic in topic_list

    def subscribe_to_topic(self, topic):
        try:
            self.__consumer_non_avro.subscribe([topic],
                                               on_assign=self.__assign)
            self.__consumer.subscribe([topic], on_assign=self.__assign)
            self.topic = topic
            return True
        except Exception as e:
            self.__log_msg(
                "An unknown error {}".format(e),
                "occurred while trying to subscribe to topic {}".format(topic),
                delimeter=" ",
                level="ERROR")
            return False

    def __assign(self, consumer, partitions):
        for p in partitions:
            p.offset = consumer.get_watermark_offsets(p)[1] - 1
        self.__consumer.assign(partitions)
        self.__consumer_non_avro.assign(partitions)

    def close(self):
        """
        Close the consumer. Once closed, this object cannot be reused
        """
        self.__consumer.close()
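
A short usage sketch for the wrapper above, reusing the example addresses from its docstring; the topic name is a placeholder.

# Hypothetical usage of the Consumer wrapper; the addresses come from the
# docstring examples above, the topic name is a placeholder.
consumer = Consumer(
    broker="localhost:9092",
    schema_registry="http://localhost:8081",
    topic="example-topic",
    auto_commit=False,
)
payload = consumer.consume(timeout=5)
if payload is not None:
    print(payload)
    consumer.commit(asynchronous=False)
consumer.close()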
Example #8
class KafkaWorker(BaseWorker):
    topic_name = None
    consumer_name = None
    consumer_settings = {}
    commit_on_complete = False
    async_commit = True
    poll_timeout = 0
    auto_offset_reset = 'earliest'
    consumer = None
    last_message = None

    def setup(self):
        self.consumer = AvroConsumer(self.get_consumer_settings())
        self.consumer.subscribe([self.get_topic_name()])

    def teardown(self):
        if self.consumer:
            self.consumer.close()

    def get_topic_name(self):
        return self.topic_name or utils.config_missing('topic name')

    def get_consumer_name(self):
        return self.consumer_name or utils.generate_random_consumer_name()

    def get_consumer_settings(self):
        default_settings = {
            'group.id': self.get_consumer_name(),
            'default.topic.config': {'auto.offset.reset': self.auto_offset_reset},
            'enable.auto.commit': False,
            'bootstrap.servers': utils.get_broker_url(),
            'schema.registry.url': utils.get_schema_registry_url(),
            'session.timeout.ms': 10000,
            'heartbeat.interval.ms': 1000,
            'api.version.request': True,
        }
        return utils.generate_client_settings(default_settings, self.consumer_settings)

    def poll(self):
        message = self.consumer.poll(timeout=self.poll_timeout)
        if message is not None:
            self.last_message = message
        return message

    def get_partitions(self):
        partitions = self.consumer.assignment()
        if not partitions:
            self.poll()
            partitions = self.consumer.assignment()
        return partitions

    def get_current_offsets(self):
        return self.consumer.position(self.get_partitions())

    def reset_consumer_offsets(self, offset):
        self.consumer.assign([TopicPartition(tp.topic, tp.partition, offset)
                              for tp in self.get_partitions()])

    def seek_to_timestamp(self, timestamp):
        timestamp_ms = dt_to_unix_ms(timestamp)
        partitions = self.get_partitions()
        for tp in partitions:
            tp.offset = timestamp_ms
        partitions = self.consumer.offsets_for_times(partitions)
        self.consumer.assign(partitions)

    def handle(self):
        message = self.poll()

        if message is None:
            self.wait()

        elif message.error():
            if message.error().code() == KafkaError._PARTITION_EOF:
                self.partition_eof(message)

            else:
                raise KafkaException(message.error())

        else:
            self._consume(message)

            if self.commit_on_complete:
                self.commit()

        self.done()

    def commit(self):
        if not self.consumer_settings.get('enable.auto.commit'):
            self.consumer.commit(asynchronous=self.async_commit)

    def _consume(self, message):
        self.consume_message(MessageValue(message))

    def consume_message(self, message):
        pass

    def partition_eof(self, message):
        pass
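
A minimal subclass sketch for the worker above; BaseWorker, utils and MessageValue come from the surrounding project, and the topic and consumer names here are placeholders.

# Hypothetical subclass of KafkaWorker; topic/consumer names are placeholders and
# BaseWorker/MessageValue are provided by the surrounding project.
class PrintingKafkaWorker(KafkaWorker):
    topic_name = 'example-topic'
    consumer_name = 'example-consumer'
    commit_on_complete = True

    def consume_message(self, message):
        # message is the MessageValue wrapper passed in by _consume()
        print(message)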
Example #9
            if msg_value['op'] in ['c', 'u']:
                upsert_art(msg_value)
            elif msg_value['op'] == 'd':
                delete_art(msg_value)
        elif msg_value['source']['table'] == 'artizen':
            if msg_value['op'] in ['c', 'u']:
                upsert_artizen(msg_value)
            elif msg_value['op'] == 'd':
                delete_artizen(msg_value)
        elif msg_value['source']['table'] == 'archive':
            if msg_value['op'] in ['c', 'u', 'd']:
                update_relation(msg_value)
        elif msg_value['source']['table'] == 'text':
            if msg_value['op'] in ['c', 'u', 'd']:
                update_introduction(msg_value)
        c.commit(message=msg)
    except (TypeError, KeyError, json.decoder.JSONDecodeError) as e:
        print('Invalid message format: {}: {}'.format(msg_value, e),
              flush=True)
    except MySQLdb.Error as e:
        print('Error in MySQL operation: {}: {}'.format(msg_value, e),
              flush=True)
    except ElasticsearchException as e:
        print('Error in sending request to ElasticSearch: {}: {}'.format(
            msg_value, e),
              flush=True)
    except Exception as e:
        print('Uncaught exception: {}: {}'.format(msg_value, e), flush=True)
        c.close()
        raise e
Example #10
class KafkaWorker(BaseWorker):
    topic_name = None
    consumer_name = None
    consumer_settings = {}
    commit_on_complete = False
    async_commit = True
    poll_timeout = 0
    auto_offset_reset = 'earliest'
    consumer = None
    last_message = None

    def setup(self):
        self.consumer = AvroConsumer(self.get_consumer_settings())
        self.consumer.subscribe([self.get_topic_name()])

    def teardown(self):
        if self.consumer:
            self.consumer.close()

    def get_topic_name(self):
        return self.topic_name or utils.config_missing('topic name')

    def get_consumer_name(self):
        return self.consumer_name or utils.generate_random_consumer_name()

    def get_consumer_settings(self):
        default_settings = {
            'group.id': self.get_consumer_name(),
            'default.topic.config': {'auto.offset.reset': self.auto_offset_reset},
            'enable.auto.commit': False,
            'bootstrap.servers': utils.get_broker_url(),
            'schema.registry.url': utils.get_schema_registry_url(),
            'session.timeout.ms': 10000,
            'heartbeat.interval.ms': 1000,
            'api.version.request': True,
        }
        return utils.generate_client_settings(default_settings, self.consumer_settings)

    def poll(self):
        message = self.consumer.poll(timeout=self.poll_timeout)
        if message is not None:
            self.last_message = message
        return message

    def get_partitions(self):
        partitions = self.consumer.assignment()
        if not partitions:
            self.poll()
            partitions = self.consumer.assignment()
        return partitions

    def get_current_offsets(self):
        return self.consumer.position(self.get_partitions())

    def reset_consumer_offsets(self, offset):
        self.consumer.assign([TopicPartition(tp.topic, tp.partition, offset)
                              for tp in self.get_partitions()])

    def seek_to_timestamp(self, timestamp):
        timestamp_ms = dt_to_unix_ms(timestamp)
        partitions = self.get_partitions()
        for tp in partitions:
            tp.offset = timestamp_ms
        partitions = self.consumer.offsets_for_times(partitions)
        self.consumer.assign(partitions)

    def handle(self):
        message = self.poll()

        if message is None:
            self.wait()

        elif message.error():
            if message.error().code() == KafkaError._PARTITION_EOF:
                self.partition_eof(message)

            else:
                raise KafkaException(message.error())

        else:
            self._consume(message)

            if self.commit_on_complete:
                self.commit()

        self.done()

    def commit(self):
        if not self.consumer_settings.get('enable.auto.commit'):
            self.consumer.commit(asynchronous=self.async_commit)

    def _consume(self, message):
        self.consume_message(MessageValue(message))

    def consume_message(self, message):
        pass

    def partition_eof(self, message):
        pass
Example #11
c = AvroConsumer({
    'bootstrap.servers': '192.168.25.163:19092',
    'group.id': 'cgroudid-4',
    'schema.registry.url': 'http://192.168.25.163:7070',
    "api.version.request": True
})
c.subscribe(['job_entity'])
running = True
while running:
    msg = None
    try:
        msg = c.poll(10)
        print(msg)
        if msg:
            if not msg.error():
                print(msg.value())
                print(msg.key())
                print(msg.partition())
                print(msg.offset())
                c.commit(msg)
            elif msg.error().code() != KafkaError._PARTITION_EOF:
                print(msg.error())
                running = False
        else:
            print("No Message!! Happily trying again!!")
    except SerializerError as e:
        print("Message deserialization failed for %s: %s" % (msg, e))
        running = False
c.commit()
c.close()
Example #12
class AvroConsumerApi(ABCMbApi):
    """
    This class implements the interface for a Kafka consumer carrying Avro messages.
    Users are expected to extend this class and override the handle_message function.
    """
    def __init__(self,
                 *,
                 consumer_conf: dict,
                 key_schema_location,
                 value_schema_location: str,
                 topics: List[str],
                 batch_size: int = 5,
                 logger: logging.Logger = None,
                 sync: bool = False):
        super(AvroConsumerApi, self).__init__(logger=logger)

        self.key_schema = self.load_schema(schema_file=key_schema_location)
        self.value_schema = self.load_schema(schema_file=value_schema_location)

        self.consumer = AvroConsumer(consumer_conf,
                                     reader_key_schema=self.key_schema,
                                     reader_value_schema=self.value_schema)
        self.running = True
        self.topics = topics
        self.batch_size = batch_size
        self.sync = sync

    def shutdown(self):
        """
        Shutdown the consumer
        :return:
        """
        self.logger.debug("Trigger shutdown")
        self.running = False

    @staticmethod
    def _create_instance(*, module_name: str, class_name: str):
        module = importlib.import_module(module_name)
        class_ = getattr(module, class_name)
        return class_()

    @staticmethod
    def _create_instance_with_params(*, module_name: str, class_name: str):
        module = importlib.import_module(module_name)
        class_ = getattr(module, class_name)
        return class_

    def process_message(self, topic: str, key: dict, value: dict):
        """
        Process the incoming message. Must be overridden in the derived class
        :param topic: topic name
        :param key: incoming message key
        :param value: incoming message value
        :return:
        """
        self.logger.debug("KAFKA: Message received for topic " + topic)
        self.logger.debug("KAFKA: Key = {}".format(key))
        self.logger.debug("KAFKA: Value = {}".format(value))
        class_name = value.get('name', None) + 'Avro'
        self.logger.debug("KAFKA: class_name = {}".format(class_name))
        module_name = f"fabric_mb.message_bus.messages.{re.sub(r'(?<!^)(?=[A-Z])', '_', class_name).lower()}"
        self.logger.debug(f"KAFKA: module_name = {module_name}")

        message = self._create_instance(module_name=module_name,
                                        class_name=class_name)
        message.from_dict(value)

        self.handle_message(message=message)

    def handle_message(self, message: AbcMessageAvro):
        """
        Handle incoming message; must be overridden by the derived class
        :param message: incoming message
        """
        print(message)

    def consume(self):
        """
            Consume records until shutdown is triggered, using a synchronous commit after each message batch.
        """
        self.consumer.subscribe(self.topics)

        msg_count = 0
        while self.running:
            try:
                msg = self.consumer.poll(1)

                # There were no messages on the queue, continue polling
                if msg is None:
                    continue

                if msg.error():
                    if msg.error().code() == KafkaError._PARTITION_EOF:
                        # End of partition event
                        self.logger.error(
                            f"KAFKA: {msg.topic()} [{msg.partition()}] reached end at offset {msg.offset()}"
                        )
                    else:
                        self.logger.error(f"KAFKA: Consumer error: {msg.error()}")
                    # Do not try to process an error event as a record
                    continue

                self.process_message(msg.topic(), msg.key(), msg.value())

                if self.sync:
                    msg_count += 1
                    if msg_count % self.batch_size == 0:
                        self.consumer.commit(asynchronous=False)

            except SerializerError as e:
                # Report malformed record, discard results, continue polling
                self.logger.error(f"KAFKA: Message deserialization failed {e}")
                continue
            except KeyboardInterrupt:
                break

        self.logger.debug("KAFKA: Shutting down consumer..")
        self.consumer.close()
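
A minimal sketch of the extension pattern the class docstring describes: subclass AvroConsumerApi and override handle_message. The configuration values and schema paths below are placeholders, not values from the example.

# Hypothetical subclass following the documented extension pattern; all
# configuration values and schema paths below are placeholders.
class PrintingConsumer(AvroConsumerApi):
    def handle_message(self, message: AbcMessageAvro):
        print(message)

consumer = PrintingConsumer(
    consumer_conf={
        "bootstrap.servers": "localhost:9092",
        "schema.registry.url": "http://localhost:8081",
        "group.id": "example-group",
        "auto.offset.reset": "earliest",
    },
    key_schema_location="schema/key.avsc",
    value_schema_location="schema/value.avsc",
    topics=["example-topic"],
    sync=True,
)
consumer.consume()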