Example #1
from confluent_kafka import DeserializingConsumer
from confluent_kafka.schema_registry import SchemaRegistryClient
from confluent_kafka.schema_registry.json_schema import JSONDeserializer
from confluent_kafka.serialization import StringDeserializer


def main(args):
    topic = args.topic

    schema_registry_conf = {'url': args.schema_registry}
    schema_registry_client = SchemaRegistryClient(schema_registry_conf)
    schema_obj = schema_registry_client.get_latest_version(
        subject_name='example_serde_json-value')

    json_deserializer = JSONDeserializer(schema_obj.schema.schema_str,
                                         from_dict=dict_to_user)
    string_deserializer = StringDeserializer('utf_8')

    consumer_conf = {
        'bootstrap.servers': args.bootstrap_servers,
        'key.deserializer': string_deserializer,
        'value.deserializer': json_deserializer,
        'group.id': args.group,
        'auto.offset.reset': "earliest"
    }

    consumer = DeserializingConsumer(consumer_conf)
    consumer.subscribe([topic])

    while True:
        try:
            msg = consumer.poll(1.0)
            if msg is None:
                continue

            user = msg.value()
            if user is not None:
                print(f"User record {msg.key()}:\n name: {user.name}\n"
                      f"\tfavorite_number: {user.favorite_color}\n"
                      f"\tfavorite_color: {user.favorite_number}\n")
        except KeyboardInterrupt:
            break
    consumer.close()
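Example #1 references a dict_to_user callable and a User record that the snippet does not show. A minimal sketch of what they might look like, with the names and fields assumed from the from_dict argument and the printed attributes, could be:

# Hypothetical helpers assumed by Example #1; names and fields are inferred, not original.
class User:
    def __init__(self, name=None, favorite_number=None, favorite_color=None):
        self.name = name
        self.favorite_number = favorite_number
        self.favorite_color = favorite_color


def dict_to_user(obj, ctx):
    # JSONDeserializer calls from_dict with the decoded dict and a SerializationContext.
    if obj is None:
        return None
    return User(name=obj['name'],
                favorite_number=obj['favorite_number'],
                favorite_color=obj['favorite_color'])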
Example #2
class KafkaAvroConsumer:

    def __init__(self, consumer_name, value_schema, topic_name="kafka-avro-producer", groupID='KafkaAvroConsumer', autocommit=True):

        # Consumer name for logging purposes
        self.logging_prefix = '['+ consumer_name + '][KafkaAvroConsumer]'

        # Schema Registry configuration
        self.schema_registry_conf = EventBackboneConfig.getSchemaRegistryConf()
        # Schema Registry Client
        self.schema_registry_client = SchemaRegistryClient(self.schema_registry_conf)
 
 
        # Key Deserializer
        self.key_deserializer = StringDeserializer('utf_8')

        # Get the schema for the value
        self.schema_id_value = self.schema_registry_client.get_latest_version(topic_name + "-value").schema_id
        # print('The Schema ID for the value is: {}'.format(self.schema_id_value))
        self.value_schema = self.schema_registry_client.get_schema(self.schema_id_value).schema_str
        print(self.logging_prefix + ' - Value Subject: {}'.format(topic_name))
        print(self.logging_prefix + ' - Value Schema:')
        print(self.logging_prefix + ' - -------------\n')
        print(self.logging_prefix + ' - ' + self.value_schema + '\n')

        # Value Deserializer
        # Passing the schema explicitly to the AvroDeserializer is currently required; this may change in the future.
        # https://github.com/confluentinc/confluent-kafka-python/issues/834
        self.value_deserializer = AvroDeserializer(self.value_schema,self.schema_registry_client)

        # Get the consumer configuration
        self.consumer_conf = EventBackboneConfig.getConsumerConfiguration(groupID, autocommit, 
                                                                        self.key_deserializer,
                                                                        self.value_deserializer)
        # Create the consumer
        self.consumer = DeserializingConsumer(self.consumer_conf)

        # Print consumer configuration
        EventBackboneConfig.printConsumerConfiguration(self.logging_prefix,self.consumer_conf,self.schema_registry_conf['url'])

        # Subscribe to the topic
        self.consumer.subscribe([topic_name])
    
    def traceResponse(self, msg):
        print(self.logging_prefix + ' - New event received\n\tTopic: {}\n\tPartition: {}\n\tOffset: {}\n\tkey: {}\n\tvalue: {}\n'
                    .format(msg.topic(), msg.partition(), msg.offset(), msg.key(), msg.value()))

    # Polls for next event
    def pollNextEvent(self):
        # Poll for messages
        msg = self.consumer.poll(timeout=POLL_TIMEOUT)
        anEvent = {}
        # Validate the returned message
        if msg is None:
            print(self.logging_prefix + ' - [INFO] - No new messages on the topic')
            return None
        elif msg.error():
            if ("PARTITION_EOF" in msg.error()):
                print(self.logging_prefix + ' - [INFO] - End of partition')
            else:
                print(self.logging_prefix + ' - [ERROR] - Consumer error: {}'.format(msg.error()))
            return None
        else:
            # Print the message
            self.traceResponse(msg)
        return msg.value()

   
    
    # Polls for the next event but returns the raw event
    def pollNextRawEvent(self):
        records = self.consumer.poll(timeout=POLL_TIMEOUT)
        if records is None:
            return None
        if records.error():
            # Stop reading if we find end of partition in the error message
            if ("PARTITION_EOF" in records.error()):
                return None
            else:
                print(self.logging_prefix + ' - [ERROR] - Consumer error: {}'.format(records.error()))
                return None
        else:
            self.traceResponse(records)
        return records


    def commitEvent(self,event):
        self.consumer.commit(event)

    def close(self):
        self.consumer.close()
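Example #2 relies on an EventBackboneConfig helper module and a POLL_TIMEOUT constant that live outside the snippet. A rough, purely illustrative stand-in, assuming environment-variable based settings rather than the project's real module, might be:

import os

POLL_TIMEOUT = 10.0  # seconds; assumed value


class EventBackboneConfig:
    # Hypothetical stand-in for the project-specific configuration module.

    @staticmethod
    def getSchemaRegistryConf():
        return {'url': os.getenv('SCHEMA_REGISTRY_URL', 'http://localhost:8081')}

    @staticmethod
    def getConsumerConfiguration(groupID, autocommit, key_deserializer, value_deserializer):
        # DeserializingConsumer accepts the deserializers alongside the usual consumer settings.
        return {
            'bootstrap.servers': os.getenv('KAFKA_BROKERS', 'localhost:9092'),
            'group.id': groupID,
            'auto.offset.reset': 'earliest',
            'enable.auto.commit': autocommit,
            'key.deserializer': key_deserializer,
            'value.deserializer': value_deserializer,
        }

    @staticmethod
    def printConsumerConfiguration(prefix, conf, sr_url):
        print(prefix + ' - consumer configuration: {}'.format(conf))
        print(prefix + ' - schema registry url: {}'.format(sr_url))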
Example #3
schema_registry_conf = {
    'url':
    schemaRegistryUrl,
    'basic.auth.user.info':
    '{}:{}'.format(confluentRegistryApiKey, confluentRegistrySecret)
}

schema_registry_client = SchemaRegistryClient(schema_registry_conf)

# COMMAND ----------

import pyspark.sql.functions as fn
from pyspark.sql.avro.functions import from_avro

keyRestResponseSchema = schema_registry_client.get_latest_version(
    confluentTopicName + "-key").schema
confluentKeySchema = keyRestResponseSchema.schema_str
valueRestResponseSchema = schema_registry_client.get_latest_version(
    confluentTopicName + "-value").schema
confluentValueSchema = valueRestResponseSchema.schema_str

# Set the option for how to fail - either stop on the first failure it finds (FAILFAST) or just set corrupt data to null (PERMISSIVE)
#fromAvroOptions = {"mode":"FAILFAST"}
fromAvroOptions = {"mode": "PERMISSIVE"}

AvroDF = (spark.readStream.format("kafka").option(
    "kafka.bootstrap.servers", confluentBootstrapServers
).option("kafka.security.protocol", "SASL_SSL").option(
    "kafka.sasl.jaas.config",
    "kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required username='******' password='******';"
    .format(confluentApiKey, confluentSecret)).option(
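The snippet above is cut off before the Avro payload is decoded. A hedged sketch of the usual continuation, assuming the value column carries Confluent-framed Avro (a 5-byte header before the payload) and reusing the schema and options fetched above, could be:

# Illustrative continuation only; column names and the 5-byte header handling are assumptions.
decodedDF = (
    AvroDF
    .withColumn("avroValue", fn.expr("substring(value, 6, length(value) - 5)"))
    .withColumn("parsedValue", from_avro("avroValue", confluentValueSchema, fromAvroOptions))
)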
Example #4
def _get_schema(schema_registry_client: SchemaRegistryClient,
                topic: str) -> str:
    """Return a schema string from an AVRO server."""
    return schema_registry_client.get_latest_version(topic).schema.schema_str
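A short usage sketch, with a placeholder registry URL and subject name; note that the topic argument is passed to the registry as the full subject, so any -value suffix must be supplied by the caller:

# Hypothetical usage; the URL and subject are placeholders.
client = SchemaRegistryClient({'url': 'http://localhost:8081'})
schema_str = _get_schema(client, 'example-topic-value')
print(schema_str)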
Example #5
class KafkaPC:
    def __init__(self, config_path, config_section):
        super(KafkaPC, self).__init__()

        self.in_topic = None
        self.out_topic = None
        self.in_schema = None
        self.out_schema = None

        self.read_config(config_path, config_section)
        self.connect_schema_registry()
        self.read_topics()
        self.create_topics_on_broker()
        self.register_schemas_in_registry()
        self.create_serializer()
        self.create_deserializer()
        self.create_consumer()
        self.create_producer()

    def connect_schema_registry(self):
        MAX_RETRIES = 3

        if self.config.get("KAFKA_SCHEMA_REGISTRY_URL") is not None:
            sr_conf = {"url": self.config["KAFKA_SCHEMA_REGISTRY_URL"]}

            retries = 0
            while retries < MAX_RETRIES:
                try:
                    self.schema_registry = SchemaRegistryClient(sr_conf)
                    print("Connected to Schema Registry")
                    break
                except Exception as e:
                    retries += 1
                    print(
                        f"Could not connect to Schema Registry, retry {retries}"
                    )
                    print(repr(e))
                    sleep(5)
            if retries == MAX_RETRIES:
                raise ConnectionError("Could not connect to Schema Registry")
        else:
            raise ValueError("Need KAFKA_SCHEMA_REGISTRY_URL")

    def register_schemas_in_registry(self, suffix="-value"):
        MAX_RETRIES = 3

        for topic, schema in self.out_schema.items():
            subject = topic + suffix
            retries = 0
            while retries < MAX_RETRIES:
                try:
                    self.schema_registry.register_schema(subject_name=subject,
                                                         schema=schema)
                    print(f"Registered schema for topic {topic} in registry")
                    break
                except Exception as e:
                    retries += 1
                    print(
                        f"Could not register schema for topic {topic} in registry: {repr(e)}"
                    )
                    sleep(5)
            if retries == MAX_RETRIES:
                raise ConnectionError("Could not connect to Schema Registry")

    def create_topics_on_broker(self, partitions=1, replication=1):
        a = AdminClient({"bootstrap.servers": self.config["KAFKA_BROKER_URL"]})

        topic_set = set(self.out_topic)

        md = a.list_topics(timeout=10)
        broker_set = set(md.topics.keys())
        diff_set = topic_set.difference(broker_set)
        new_topics = [
            NewTopic(topic,
                     num_partitions=partitions,
                     replication_factor=replication) for topic in diff_set
        ]

        fs = a.create_topics(new_topics)

        # Wait for operation to finish.
        # Timeouts are preferably controlled by passing request_timeout=15.0
        # to the create_topics() call.
        # All futures will finish at the same time.
        for topic, f in fs.items():
            try:
                f.result()  # The result itself is None
                print(f"Topic {topic} created on Broker")
            except Exception as e:
                print(f"Failed to create topic {topic} on Broker: {repr(e)}")

    def get_schema_from_registry(self, topic, suffix="-value"):
        response = None

        MAX_RETRIES = 3
        retries = 0
        while retries < MAX_RETRIES:

            try:
                schema = self.schema_registry.get_latest_version(topic +
                                                                 suffix)
                response = schema.schema
                print(f"Retrieved schema for topic {topic} from Registry")
                break
            except Exception as e:
                retries += 1
                print(f"Failed to get schema: {repr(e)}")
                sleep(3)
        return response

    def read_topics(self):

        if self.config.get("IN_TOPIC") and self.config.get("IN_GROUP"):
            self.in_topic = self.config["IN_TOPIC"]

            self.in_schema = {}
            for topic in self.in_topic:
                # try to get schema from registry
                schema = self.get_schema_from_registry(topic)
                # if no schema is found, a plain string deserializer is used instead (see create_deserializer below)
                if schema is None:
                    self.in_schema[topic] = None
                else:
                    self.in_schema[topic] = schema

        if self.config.get("OUT_TOPIC"):
            self.out_topic = list(self.config["OUT_TOPIC"].keys())
            self.out_schema = {}
            for topic, schema in self.config["OUT_TOPIC"].items():
                self.out_schema[topic] = self.read_avro_schema(schema)

    def create_serializer(self):
        self.serializer = {}
        if self.out_topic is not None:
            for topic in self.out_topic:
                schema_str = self.out_schema[topic].schema_str
                self.serializer[topic] = AvroSerializer(
                    schema_str, self.schema_registry)

    def create_deserializer(self):
        self.deserializer = {}
        if self.in_topic is not None:
            for topic in self.in_topic:
                if self.in_schema[topic] is None:
                    self.deserializer[topic] = StringDeserializer("utf_8")
                else:
                    schema_str = self.in_schema[topic].schema_str
                    self.deserializer[topic] = AvroDeserializer(
                        schema_str, self.schema_registry)

    def create_consumer(self):

        if self.config.get("IN_TOPIC") and self.config.get("IN_GROUP"):

            consumer_conf = {
                "bootstrap.servers": self.config["KAFKA_BROKER_URL"],
                "group.id": self.config["IN_GROUP"],
                "auto.offset.reset": "earliest",
            }

            self.consumer = Consumer(consumer_conf)
            self.consumer.subscribe(self.in_topic)

    def create_producer(self):
        if self.config.get("OUT_TOPIC"):
            producer_conf = {
                "bootstrap.servers": self.config["KAFKA_BROKER_URL"]
            }
            self.producer = Producer(producer_conf)

    def read_config(self, config_path, config_section):
        self.config = {}
        if config_path is not None and config_section is not None:
            config_section = config_section.replace(" ", "").split(",")
        else:
            raise ValueError(
                "Configuration requires config_path and config_section")
        try:
            with open(config_path, "r") as ymlfile:
                config = yaml.load(ymlfile, Loader=yaml.FullLoader)
                for section in config_section:
                    for key, value in config[section].items():
                        self.config[key] = value

        except Exception as e:
            print(f"Failed to read the config: {repr(e)}")
            sys.exit()

    def read_avro_schema(self, schema):

        with open(schema, "r") as f:
            schema_str = f.read()
        avro_schema_str = Schema(schema_str, "AVRO")

        return avro_schema_str

    def decode_msg(self, msg):

        try:
            topic = msg.topic()
            value = self.deserializer[topic](msg.value(), None)
            return value
        except Exception as e:
            print(f"Error decoding avro data: {repr(e)}")
            # sys.exit()

    def send_msg(self, message, partition=0, topic=None):

        # if no topic is provided, the first topic in the list is used as default
        if topic is None:
            out_topic = self.out_topic[0]
        else:
            out_topic = topic

        # encode the data with the specified Avro out_schema
        ctx = SerializationContext(out_topic, MessageField.VALUE)
        ser_message = self.serializer[out_topic](message, ctx)

        try:
            self.producer.produce(topic=out_topic,
                                  value=ser_message,
                                  partition=partition)
        except Exception as e:
            print(f"Error sending data to Kafka: {repr(e)}")
Example #6
class KafkaPC:
    def __init__(self, config_path, config_section):
        super(KafkaPC, self).__init__()

        self.in_topic = None
        self.out_topic = None
        self.in_schema = None
        self.out_schema = None

        self.read_config(config_path, config_section)
        self.connect_schema_registry()
        self.read_topics()
        self.create_serializer()
        self.create_deserializer()
        self.create_consumer()
        self.create_producer()

    def connect_schema_registry(self):

        if self.config.get("KAFKA_SCHEMA_REGISTRY_URL") is not None:
            sr_conf = {"url": self.config["KAFKA_SCHEMA_REGISTRY_URL"]}
            self.schema_registry = SchemaRegistryClient(sr_conf)
        else:
            raise ValueError("Need KAFKA_SCHEMA_REGISTRY_URL")

    def get_schema_from_registry(self, topic, suffix="-value"):
        response = None
        try:
            schema = self.schema_registry.get_latest_version(topic + suffix)
            response = schema.schema
        except Exception as e:
            print(f"Exception: {repr(e)}")
        return response

    def read_topics(self):

        if self.config.get("IN_TOPIC") and self.config.get("IN_GROUP"):
            self.in_topic = self.config["IN_TOPIC"]

            self.in_schema = {}
            for topic in self.in_topic:
                # try to get schema from registry
                schema = self.get_schema_from_registry(topic)
                # if no schema is found, a plain string deserializer is used instead (see create_deserializer below)
                if schema is None:
                    self.in_schema[topic] = None
                else:
                    self.in_schema[topic] = schema

        if self.config.get("OUT_TOPIC"):
            self.out_topic = list(self.config["OUT_TOPIC"].keys())
            self.out_schema = {}
            for topic, schema in self.config["OUT_TOPIC"].items():
                self.out_schema[topic] = self.read_avro_schema(schema)

    def create_serializer(self):
        self.serializer = {}
        if self.out_topic is not None:
            for topic in self.out_topic:
                schema_str = self.out_schema[topic].schema_str
                self.serializer[topic] = AvroSerializer(
                    schema_str, self.schema_registry)

    def create_deserializer(self):
        self.deserializer = {}
        if self.in_topic is not None:
            for topic in self.in_topic:
                if self.in_schema[topic] is None:
                    self.deserializer[topic] = StringDeserializer("utf_8")
                else:
                    schema_str = self.in_schema[topic].schema_str
                    self.deserializer[topic] = AvroDeserializer(
                        schema_str, self.schema_registry)

    def create_consumer(self):

        if self.config.get("IN_TOPIC") and self.config.get("IN_GROUP"):

            consumer_conf = {
                "bootstrap.servers": self.config["KAFKA_BROKER_URL"],
                "group.id": self.config["IN_GROUP"],
                "auto.offset.reset": "earliest",
            }

            self.consumer = Consumer(consumer_conf)
            self.consumer.subscribe(self.in_topic)

    def create_producer(self):
        if self.config.get("OUT_TOPIC"):
            producer_conf = {
                "bootstrap.servers": self.config["KAFKA_BROKER_URL"]
            }
            self.producer = Producer(producer_conf)

    def read_config(self, config_path, config_section):
        self.config = {}
        if config_path is not None and config_section is not None:
            config_section = config_section.replace(" ", "").split(",")
        else:
            raise ValueError(
                "Configuration requires config_path and config_section")
        try:
            with open(config_path, "r") as ymlfile:
                config = yaml.load(ymlfile, Loader=yaml.FullLoader)
                for section in config_section:
                    for key, value in config[section].items():
                        self.config[key] = value

        except Exception as e:
            print(f"Failed to read the config: {repr(e)}")
            sys.exit()

    def read_avro_schema(self, schema):

        with open(schema, "r") as f:
            schema_str = f.read()
        avro_schema_str = Schema(schema_str, "AVRO")

        return avro_schema_str

    def decode_msg(self, msg):

        try:
            topic = msg.topic()
            value = self.deserializer[topic](msg.value(), None)
            return value
        except Exception as e:
            print(f"Error decoding avro data: {repr(e)}")
            sys.exit()

    def send_msg(self, message, partition=0, topic=None):

        # if no topic is provided, the first topic in the list is used as default
        if topic is None:
            out_topic = self.out_topic[0]
        else:
            out_topic = topic

        # encode the data with the specified Avro out_schema
        ctx = SerializationContext(out_topic, MessageField.VALUE)
        ser_message = self.serializer[out_topic](message, ctx)

        try:
            self.producer.produce(topic=out_topic,
                                  value=ser_message,
                                  partition=partition)
        except Exception as e:
            print(f"Error sending data to Kafka: {repr(e)}")
Example #7
                    nargs='?',
                    help='The name of the table to create')
parser.add_argument(
    's3_location',
    nargs='?',
    help='S3 location of your database. Example: s3://bucket/folder/')
parser.add_argument('--partition',
                    nargs='+',
                    type=str,
                    help='partitions, can be specified multiple times.',
                    default=[])

args = parser.parse_args()

schema_registry = SchemaRegistryClient({"url": args.registry_url})
avro_schema_literal = schema_registry.get_latest_version(
    f"{args.avro_subject}").schema.schema_str

athena_schema, partition_schema = create_athena_schema_from_avro(
    avro_schema_literal, args.partition)

if partition_schema:
    partition_statement = f'\nPARTITIONED BY ({partition_schema})'
else:
    partition_statement = ''

print(f'''
CREATE DATABASE IF NOT EXISTS {args.athena_database};
''')

print(f'''
CREATE EXTERNAL TABLE IF NOT EXISTS