Code example #1
 def setUp(self):
     # need to set up the serializer
     # Make RecordSchema and PrimitiveSchema hashable
     schema.RecordSchema.__hash__ = self.hash_func
     schema.PrimitiveSchema.__hash__ = self.hash_func
     self.client = MockSchemaRegistryClient()
     self.ms = MessageSerializer(self.client)
Code example #2
    def __init__(self,
                 config,
                 default_key_schema=None,
                 default_value_schema=None,
                 schema_registry=None,
                 subject_name_strategy=SubjectNameStrategy.RecordNameStrategy
                 ):

        sr_conf = {key.replace("schema.registry.", ""): value
                   for key, value in config.items() if key.startswith("schema.registry")}

        if sr_conf.get("basic.auth.credentials.source") == 'SASL_INHERIT':
            # Fall back to the plural 'mechanisms' key for backward compatibility
            sr_conf['sasl.mechanism'] = config.get('sasl.mechanism', config.get('sasl.mechanisms', ''))
            sr_conf['sasl.username'] = config.get('sasl.username', '')
            sr_conf['sasl.password'] = config.get('sasl.password', '')

        # schema auto-registration applies regardless of the credentials source
        sr_conf['auto.register.schemas'] = config.get('auto.register.schemas', True)

        ap_conf = {key: value
                   for key, value in config.items() if not key.startswith("schema.registry")}

        if schema_registry is None:
            schema_registry = CachedSchemaRegistryClient(sr_conf)
        elif sr_conf.get("url", None) is not None:
            raise ValueError("Cannot pass schema_registry along with schema.registry.url config")

        super(AvroProducer, self).__init__(ap_conf)
        self._serializer = MessageSerializer(schema_registry, subject_name_strategy=subject_name_strategy)
        self._key_schema = default_key_schema
        self._value_schema = default_value_schema
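For orientation, here is a minimal usage sketch for a constructor of this shape. It assumes the confluent_kafka.avro package layout; the broker address, registry URL, topic, and schema below are illustrative, not taken from the example above.

from confluent_kafka import avro

value_schema = avro.loads('{"type": "record", "name": "User",'
                          ' "fields": [{"name": "name", "type": "string"}]}')
producer = AvroProducer({
    'bootstrap.servers': 'localhost:9092',          # hypothetical broker
    'schema.registry.url': 'http://localhost:8081'  # hypothetical registry
}, default_value_schema=value_schema)
producer.produce(topic='users', value={'name': 'alice'})
producer.flush()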
Code example #3
    def __init__(self,
                 config,
                 default_key_schema=None,
                 default_value_schema=None,
                 schema_registry=None):

        sr_conf = {
            key.replace("schema.registry.", ""): value
            for key, value in config.items()
            if key.startswith("schema.registry")
        }

        if sr_conf.get("basic.auth.credentials.source") == 'SASL_INHERIT':
            sr_conf['sasl.mechanisms'] = config.get('sasl.mechanisms', '')
            sr_conf['sasl.username'] = config.get('sasl.username', '')
            sr_conf['sasl.password'] = config.get('sasl.password', '')

        ap_conf = {
            key: value
            for key, value in config.items()
            if not key.startswith("schema.registry")
        }

        if schema_registry is None:
            schema_registry = CachedSchemaRegistryClient(sr_conf)
        elif sr_conf.get("url", None) is not None:
            raise ValueError(
                "Cannot pass schema_registry along with schema.registry.url config"
            )

        super(AvroProducer, self).__init__(ap_conf)
        self._serializer = MessageSerializer(schema_registry)
        self._key_schema = default_key_schema
        self._value_schema = default_value_schema
Code example #4
    def __init__(self,
                 config,
                 default_key_schema=None,
                 default_value_schema=None,
                 schema_registry=None):

        schema_registry_url = config.pop("schema.registry.url", None)
        schema_registry_ca_location = config.pop(
            "schema.registry.ssl.ca.location", None)
        schema_registry_certificate_location = config.pop(
            "schema.registry.ssl.certificate.location", None)
        schema_registry_key_location = config.pop(
            "schema.registry.ssl.key.location", None)

        if schema_registry is None:
            if schema_registry_url is None:
                raise ValueError("Missing parameter: schema.registry.url")

            schema_registry = CachedSchemaRegistryClient(
                url=schema_registry_url,
                ca_location=schema_registry_ca_location,
                cert_location=schema_registry_certificate_location,
                key_location=schema_registry_key_location)
        elif schema_registry_url is not None:
            raise ValueError(
                "Cannot pass schema_registry along with schema.registry.url config"
            )

        super(AvroProducer, self).__init__(config)
        self._serializer = MessageSerializer(schema_registry)
        self._key_schema = default_key_schema
        self._value_schema = default_value_schema
Code example #5
def produce(config, topic, input_messages):
    """
        produce initiate sending a message to Kafka, call the produce method passing in the input_messages key/value
        and and callback
    Parameters
    ----------
        topic: str
            topic where the input message publish too
        input_messages: dict
            a key/value input messages
        config: dict
            the config values that needed by the produce

     """
    if topic is None:
        logger.debug('Required topic field must be set')
        raise ValueError('Required topic field must be set')

    if not input_messages:
        logger.debug('Required data field must not be empty.')
        raise ValueError('Required data field must not be empty.')

    bootstrap_servers, schema_registry = producer_config(config)

    producer = Producer(bootstrap_servers)
    admin_client = AdminClient(bootstrap_servers)
    topics = admin_client.list_topics().topics
    # Just to show what's available
    print(topics)

    if not topics:
        print('No topics found')
        raise RuntimeError('No topics found on the broker')

    sr = CachedSchemaRegistryClient(schema_registry)
    ser = MessageSerializer(sr)
    # get the latest registered schema for the topic's value subject
    schema_id, schema, version = sr.get_latest_schema(topic + "-value")
    if schema:
        print('In If Schema')
        for key, value in input_messages.items():
            if validate_uuid4(key):
                print('In validate in For loop')
                serializedMessage = ser.encode_record_with_schema(
                    topic, schema, value)
                producer.produce(topic=topic,
                                 key=key,
                                 value=serializedMessage,
                                 callback=acked)
                # producer.flush() # bad idea, it limits throughput to the broker round trip time
                producer.poll(1)
            else:
                print('In Else of For Loop')
                logger.error('Invalid UUID String: %s', key)
        # wait for any outstanding deliveries before returning
        producer.flush()
    else:
        print('Schema not found for topic name: ', topic)
        print('In Else Schema')
        sys.exit(1)
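The function above calls three helpers (producer_config, acked, validate_uuid4) that it does not define. A plausible sketch of them, reusing the module-level logger and assuming config carries 'bootstrap.servers' and 'schema.registry.url' keys (these names are assumptions, not from the source):

import uuid

def producer_config(config):
    # split the broker settings from the registry config (assumed key names)
    return ({'bootstrap.servers': config['bootstrap.servers']},
            {'url': config['schema.registry.url']})

def acked(err, msg):
    # delivery callback invoked from producer.poll()/producer.flush()
    if err is not None:
        logger.error('Delivery failed: %s', err)

def validate_uuid4(key):
    # True if key parses as a version-4 UUID string
    try:
        uuid.UUID(key, version=4)
        return True
    except (AttributeError, TypeError, ValueError):
        return False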
Code example #6
class AvroProducer(Producer):
    """
        Kafka Producer client which does Avro schema encoding of messages.
        Handles schema registration and message serialization.

        The constructor takes the following parameters:

        @:param: config: dict object with config parameters containing url for schema registry (schema.registry.url).
        @:param: default_key_schema: Optional avro schema for key
        @:param: default_value_schema: Optional avro schema for value
    """
    def __init__(self,
                 config,
                 default_key_schema=None,
                 default_value_schema=None):
        if 'schema.registry.url' not in config:
            raise ValueError("Missing parameter: schema.registry.url")
        schema_registry_url = config.pop("schema.registry.url")

        super(AvroProducer, self).__init__(config)
        self._serializer = MessageSerializer(
            CachedSchemaRegistryClient(url=schema_registry_url))
        self._key_schema = default_key_schema
        self._value_schema = default_value_schema

    def produce(self, **kwargs):
        """
            Sends a message to Kafka by encoding it with the specified Avro schema.
            @:param: topic: topic name
            @:param: value: A dictionary object
            @:param: value_schema : Avro schema for value
            @:param: key: A dictionary object
            @:param: key_schema : Avro schema for key
            @:exception: SerializerError
        """
        # get schemas from  kwargs if defined
        key_schema = kwargs.pop('key_schema', self._key_schema)
        value_schema = kwargs.pop('value_schema', self._value_schema)
        topic = kwargs.pop('topic', None)
        if not topic:
            raise ClientError("Topic name not specified.")
        value = kwargs.pop('value', None)
        key = kwargs.pop('key', None)
        if value is not None:  # falsy values such as {} or 0 must still be encoded
            if value_schema:
                value = self._serializer.encode_record_with_schema(
                    topic, value_schema, value)
            else:
                raise SerializerError("Avro schema required for value")

        if key is not None:
            if key_schema:
                key = self._serializer.encode_record_with_schema(
                    topic, key_schema, key, True)
            else:
                raise SerializerError("Avro schema required for key")

        super(AvroProducer, self).produce(topic, value, key, **kwargs)
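A short usage sketch for this class; the broker, registry URL, topic, and schema are illustrative assumptions:

from confluent_kafka import avro

click_schema = avro.loads('{"type": "record", "name": "Click",'
                          ' "fields": [{"name": "url", "type": "string"}]}')
p = AvroProducer({'bootstrap.servers': 'localhost:9092',
                  'schema.registry.url': 'http://localhost:8081'})
# the schema can also be passed per call instead of as a constructor default
p.produce(topic='clicks', value={'url': '/home'}, value_schema=click_schema)
p.flush()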
Code example #7
File: deactivate_py.py  Project: cm-rennie/scripts
class AvroProducer(object):
    def __init__(self,
                 config,
                 default_key_schema=None,
                 default_value_schema=None,
                 schema_registry=None):
        schema_registry_url = config.pop("schema.registry.url", None)
        schema_registry_ca_location = config.pop(
            "schema.registry.ssl.ca.location", None)
        schema_registry_certificate_location = config.pop(
            "schema.registry.ssl.certificate.location", None)
        schema_registry_key_location = config.pop(
            "schema.registry.ssl.key.location", None)

        if schema_registry is None:
            if schema_registry_url is None:
                raise ValueError("Missing parameter: schema.registry.url")

            schema_registry = CachedSchemaRegistryClient(
                url=schema_registry_url,
                ca_location=schema_registry_ca_location,
                cert_location=schema_registry_certificate_location,
                key_location=schema_registry_key_location)
        elif schema_registry_url is not None:
            raise ValueError(
                "Cannot pass schema_registry along with schema.registry.url config"
            )

        self.producer = Producer(config)
        self._serializer = MessageSerializer(schema_registry)
        self._key_schema = default_key_schema
        self._value_schema = default_value_schema

    def flush(self):
        self.producer.flush()

    def produce(self, **kwargs):
        key_schema = kwargs.pop('key_schema', self._key_schema)
        value_schema = kwargs.pop('value_schema', self._value_schema)
        topic = kwargs.pop('topic', None)
        if not topic:
            raise ClientError("Topic name not specified.")
        value = kwargs.pop('value', None)
        key = kwargs.pop('key', None)

        if value is not None:
            if value_schema:
                value = self._serializer.encode_record_with_schema(
                    topic, value_schema, value)
            else:
                raise ValueSerializerError("Avro schema required for values")

        if key is not None:
            if key_schema:
                key = self._serializer.encode_record_with_schema(
                    topic, key_schema, key, True)
            # note: without a key_schema the key is passed through unencoded

        self.producer.produce(topic, value, key, **kwargs)
Code example #8
class AvroConsumer(Consumer):
    """
    Kafka Consumer client which does Avro schema decoding of messages.
    Handles message deserialization.

    The constructor takes the following parameters:

    :param dict config: Config parameters containing url for schema registry (``schema.registry.url``)
                        and the standard Kafka client configuration (``bootstrap.servers`` et al.).
    """
    def __init__(self, config, schema_registry=None):

        schema_registry_url = config.pop("schema.registry.url", None)
        schema_registry_ca_location = config.pop("schema.registry.ssl.ca.location", None)
        schema_registry_certificate_location = config.pop("schema.registry.ssl.certificate.location", None)
        schema_registry_key_location = config.pop("schema.registry.ssl.key.location", None)

        if schema_registry is None:
            if schema_registry_url is None:
                raise ValueError("Missing parameter: schema.registry.url")

            schema_registry = CachedSchemaRegistryClient(url=schema_registry_url,
                                                         ca_location=schema_registry_ca_location,
                                                         cert_location=schema_registry_certificate_location,
                                                         key_location=schema_registry_key_location)
        elif schema_registry_url is not None:
            raise ValueError("Cannot pass schema_registry along with schema.registry.url config")

        super(AvroConsumer, self).__init__(config)
        self._serializer = MessageSerializer(schema_registry)

    def poll(self, timeout=None, with_schema=False):
        """
        This is an overridden method from the confluent_kafka.Consumer class. It handles message
        deserialization using the Avro schema.

        :param float timeout: Poll timeout in seconds (default: indefinite)
        :param boolean with_schema: If true, the key_schema and value_schema are added as properties of the message
                                    (default: False)
        :returns: message object with deserialized key and value as dict objects
        :rtype: Message or AvroMessage
        """
        if timeout is None:
            timeout = -1
        message = super(AvroConsumer, self).poll(timeout)
        key_schema = value_schema = None
        if message is None:
            return None
        if not message.value() and not message.key():
            return message
        if not message.error():
            if message.value() is not None:
                decoded_value, value_schema = self._serializer.decode_message(message.value())
                message.set_value(decoded_value)
            if message.key() is not None:
                decoded_key, key_schema = self._serializer.decode_message(message.key())
                message.set_key(decoded_key)
        return message if not with_schema else AvroMessage(key_schema, value_schema, message)
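AvroMessage is referenced by poll() but not defined in this snippet; a minimal plausible definition is a named tuple (an assumption, not code from the source project):

from collections import namedtuple

# hypothetical container for poll(with_schema=True) results
AvroMessage = namedtuple('AvroMessage', ['key_schema', 'value_schema', 'message'])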
Code example #9
    def __init__(self, config):

        if 'schema.registry.url' not in config:
            raise ValueError("Missing parameter: schema.registry.url")
        schema_registry_url = config.pop("schema.registry.url")

        super(AvroConsumer, self).__init__(config)
        self._serializer = MessageSerializer(CachedSchemaRegistryClient(url=schema_registry_url))
Code example #10
    def __init__(self, config, default_key_schema=None,
                 default_value_schema=None):
        if 'schema.registry.url' not in config:
            raise ValueError("Missing parameter: schema.registry.url")
        schema_registry_url = config.pop("schema.registry.url")

        super(AvroProducer, self).__init__(config)
        self._serializer = MessageSerializer(CachedSchemaRegistryClient(url=schema_registry_url))
        self._key_schema = default_key_schema
        self._value_schema = default_value_schema
Code example #11
class TestMessageSerializer(unittest.TestCase):
    def setUp(self):
        # need to set up the serializer
        self.client = MockSchemaRegistryClient()
        self.ms = MessageSerializer(self.client)

    def assertMessageIsSame(self, message, expected, schema_id, schema):
        self.assertTrue(message)
        self.assertTrue(len(message) > 5)
        magic, sid = struct.unpack('>bI', message[0:5])
        self.assertEqual(magic, 0)
        self.assertEqual(sid, schema_id)
        decoded_msg, decoded_schema = self.ms.decode_message(message)
        self.assertTrue(decoded_msg)
        self.assertEqual(decoded_msg, expected)
        self.assertEqual(decoded_schema, schema)

    def test_encode_with_schema_id(self):
        adv = avro.loads(data_gen.ADVANCED_SCHEMA)
        basic = avro.loads(data_gen.BASIC_SCHEMA)
        subject = 'test'
        schema_id = self.client.register(subject, basic)

        records = data_gen.BASIC_ITEMS
        for record in records:
            message = self.ms.encode_record_with_schema_id(schema_id, record)
            self.assertMessageIsSame(message, record, schema_id, basic)

        subject = 'test_adv'
        adv_schema_id = self.client.register(subject, adv)
        self.assertNotEqual(adv_schema_id, schema_id)
        records = data_gen.ADVANCED_ITEMS
        for record in records:
            message = self.ms.encode_record_with_schema_id(
                adv_schema_id, record)
            self.assertMessageIsSame(message, record, adv_schema_id, adv)

    def test_encode_record_with_schema(self):
        topic = 'test'
        basic = avro.loads(data_gen.BASIC_SCHEMA)
        subject = 'test-value'
        schema_id = self.client.register(subject, basic)
        records = data_gen.BASIC_ITEMS
        for record in records:
            message = self.ms.encode_record_with_schema(topic, basic, record)
            self.assertMessageIsSame(message, record, schema_id, basic)

    def test_decode_none(self):
        """"null/None messages should decode to None"""

        self.assertIsNone(self.ms.decode_message(None))

    def hash_func(self):
        return hash(str(self))
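The header assertions above follow the Confluent wire format: one magic byte (0), a big-endian 4-byte schema id, then the Avro-encoded payload. A standalone sketch of that framing logic:

import struct

def split_confluent_frame(message):
    # header: magic byte (must be 0) plus a big-endian 4-byte schema id
    magic, schema_id = struct.unpack('>bI', message[0:5])
    if magic != 0:
        raise ValueError('not a Confluent-framed message')
    return schema_id, message[5:]  # schema id and the raw Avro payload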
Code example #12
File: kafkadriver.py  Project: jinnymus/Python
    def read_from_offset(self, offset=0, lang='json', schema=None):

        '''
        Read JSON and Avro messages from the consumer, starting at the given offset.
        '''
        log.debug("[KafkaDriver][read_from_offset] lang: " + str(lang))
        log.debug("[KafkaDriver][read_from_offset] offset: " + str(offset))

        def outputJSON(obj):

            '''
            Default JSON serializer: converts datetimes to epoch milliseconds.
            '''

            if isinstance(obj, datetime.datetime):
                return int(obj.strftime("%s%f")[:-3])
            return obj


        ret = None
        log.debug("[KafkaDriver][read_from_offset] read start: " + str(self.server))
        consumer = KafkaConsumer(bootstrap_servers=self.server + ':9092',
                                 auto_offset_reset='earliest',
                                 consumer_timeout_ms=1000)

        partition = TopicPartition(self.topic, 0)
        consumer.assign([partition])
        # position the consumer at the requested offset
        consumer.seek(partition, int(offset))

        if lang == 'avro':
            # create the registry client and serializer once, not per message
            schema_registry = CachedSchemaRegistryClient(url='http://' + self.schema_registry + ':8081')
            self._serializer = MessageSerializer(schema_registry)

        for msg in consumer:
            if lang == 'avro':
                message = self._serializer.decode_message(msg.value)
                message = json.dumps(message, indent=4, sort_keys=True, default=outputJSON)
                ret = message
            else:
                message = msg.value
                #log.debug("[KafkaDriver][read_from_offset] other message: " + str(message))
                ret = msg.value
            log.debug("[KafkaDriver][read_from_offset] msg: " + str(message) + " msg.offset: " + str(msg.offset))
        consumer.close()
        log.debug("[KafkaDriver][read_from_offset] read end")
        return ret
Code example #13
 def __init__(self, schema_registry_url):
     """Private implementation class for Avro IO using the registry"""
     log.info(
         f"Using registry with schema_url/id {schema_registry_url}/{config.SCHEMA_ID}"
     )
     try:
         self.client = CachedSchemaRegistryClient(url=schema_registry_url)
         self.schema = self.client.get_by_id(config.SCHEMA_ID)
         self.serializer = MessageSerializer(self.client)
     except Exception as e:
         raise ValueError("Client id or schema id not found") from e
Code example #14
 def __init__(self,
              producer,
              schema_registry_url,
              default_key_schema=None,
              default_value_schema=None
              ):  # real signature unknown; restored from __doc__
     self._producer = producer
     self._serializer = MessageSerializer(
         CachedSchemaRegistryClient(url=schema_registry_url))
     self.key_schema = default_key_schema
     self.value_schema = default_value_schema
Code example #15
def consume(config, topic, handler):
    """
    Starts a consumer and calls the given handler for each consumed message.
    Assumes that keys are serialized as strings and values are serialized
    as Avro objects with their schemas stored in a Confluent Schema Registry.
    """
    c_conf = {}
    for key, value in config.items():
        if not key.startswith("schema.registry"):
            if value is not None:
                c_conf[key] = value.strip()

    if "auto.offset.reset" in c_conf:
        print("offset provided")
    else:
        c_conf['auto.offset.reset'] = 'earliest'

    if "group.id" in c_conf:
        print("group id provided")
    else:
        c_conf['group.id'] = 'sme_test'

    c = Consumer(c_conf)
    c.subscribe([topic])

    sr_conf = {
        key.replace("schema.registry.", ""): value.strip()
        for key, value in config.items() if key.startswith("schema.registry")
    }

    sr = CachedSchemaRegistryClient(sr_conf)
    ser = MessageSerializer(sr)

    while True:
        try:
            msg = c.poll(10)
            if msg is None:
                print('No Messages')
                continue
            if msg.error():
                log.error("Consumer error: {}".format(msg.error()))
                continue
            key = msg.key().decode('utf-8')
            value = ser.decode_message(msg.value(), is_key=False)
        except Exception as e:
            log.error("Message consumption failed: {}".format(e))
            break
        try:
            handler(key, value)
        except Exception as e:
            log.error("Message handler failed: {}".format(e))
            break
    c.close()
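A sketch of driving this loop; the config keys, topic, and handler are illustrative (note the function strips every value, so all config values must be strings):

def print_handler(key, value):
    print(key, value)

consume({
    'bootstrap.servers': 'localhost:9092',           # hypothetical broker
    'schema.registry.url': 'http://localhost:8081',  # hypothetical registry
}, 'my-avro-topic', print_handler)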
Code example #16
class TestMessageSerializer(unittest.TestCase):
    def setUp(self):
        # need to set up the serializer
        self.client = MockSchemaRegistryClient()
        self.ms = MessageSerializer(self.client)

    def assertMessageIsSame(self, message, expected, schema_id):
        self.assertTrue(message)
        self.assertTrue(len(message) > 5)
        magic, sid = struct.unpack('>bI', message[0:5])
        self.assertEqual(magic, 0)
        self.assertEqual(sid, schema_id)
        decoded = self.ms.decode_message(message)
        self.assertTrue(decoded)
        self.assertEqual(decoded, expected)

    def test_encode_with_schema_id(self):
        adv = avro.loads(data_gen.ADVANCED_SCHEMA)
        basic = avro.loads(data_gen.BASIC_SCHEMA)
        subject = 'test'
        schema_id = self.client.register(subject, basic)

        records = data_gen.BASIC_ITEMS
        for record in records:
            message = self.ms.encode_record_with_schema_id(schema_id, record)
            self.assertMessageIsSame(message, record, schema_id)

        subject = 'test_adv'
        adv_schema_id = self.client.register(subject, adv)
        self.assertNotEqual(adv_schema_id, schema_id)
        records = data_gen.ADVANCED_ITEMS
        for record in records:
            message = self.ms.encode_record_with_schema_id(adv_schema_id, record)
            self.assertMessageIsSame(message, record, adv_schema_id)

    def test_encode_record_with_schema(self):
        topic = 'test'
        basic = avro.loads(data_gen.BASIC_SCHEMA)
        subject = 'test-value'
        schema_id = self.client.register(subject, basic)
        records = data_gen.BASIC_ITEMS
        for record in records:
            message = self.ms.encode_record_with_schema(topic, basic, record)
            self.assertMessageIsSame(message, record, schema_id)

    def test_decode_none(self):
        """"null/None messages should decode to None"""

        self.assertIsNone(self.ms.decode_message(None))

    def hash_func(self):
        return hash(str(self))
Code example #17
class TestMessageSerializer(unittest.TestCase):
    def setUp(self):
        # need to set up the serializer
        # Make RecordSchema and PrimitiveSchema hashable
        schema.RecordSchema.__hash__ = self.hash_func
        schema.PrimitiveSchema.__hash__ = self.hash_func
        self.client = MockSchemaRegistryClient()
        self.ms = MessageSerializer(self.client)

    def assertMessageIsSame(self, message, expected, schema_id):
        self.assertTrue(message)
        self.assertTrue(len(message) > 5)
        magic, sid = struct.unpack('>bI', message[0:5])
        self.assertEqual(magic, 0)
        self.assertEqual(sid, schema_id)
        decoded = self.ms.decode_message(message)
        self.assertTrue(decoded)
        self.assertEqual(decoded, expected)

    def test_encode_with_schema_id(self):
        adv = util.parse_schema_from_string(data_gen.ADVANCED_SCHEMA)
        basic = util.parse_schema_from_string(data_gen.BASIC_SCHEMA)
        subject = 'test'
        schema_id = self.client.register(subject, basic)

        records = data_gen.BASIC_ITEMS
        for record in records:
            message = self.ms.encode_record_with_schema_id(schema_id, record)
            self.assertMessageIsSame(message, record, schema_id)

        subject = 'test_adv'
        adv_schema_id = self.client.register(subject, adv)
        self.assertNotEqual(adv_schema_id, schema_id)
        records = data_gen.ADVANCED_ITEMS
        for record in records:
            message = self.ms.encode_record_with_schema_id(
                adv_schema_id, record)
            self.assertMessageIsSame(message, record, adv_schema_id)

    def test_encode_record_with_schema(self):
        topic = 'test'
        basic = util.parse_schema_from_string(data_gen.BASIC_SCHEMA)
        subject = 'test-value'
        schema_id = self.client.register(subject, basic)
        records = data_gen.BASIC_ITEMS
        for record in records:
            message = self.ms.encode_record_with_schema(topic, basic, record)
            self.assertMessageIsSame(message, record, schema_id)

    def hash_func(self):
        return hash(str(self))
Code example #18
    def __init__(self, config, schema_registry=None):
        schema_registry_url = config.pop("schema.registry.url", None)
        if schema_registry is None:
            if schema_registry_url is None:
                raise ValueError("Missing parameter: schema.registry.url")
            schema_registry = CachedSchemaRegistryClient(
                url=schema_registry_url)
        elif schema_registry_url is not None:
            raise ValueError(
                "Cannot pass schema_registry along with schema.registry.url config"
            )

        super(AvroConsumer, self).__init__(config)
        self._serializer = MessageSerializer(schema_registry)
Code example #19
class AvroConsumer(Consumer):
    """
    Kafka Consumer client which does avro schema decoding of messages.
    Handles message deserialization.

    The constructor takes the following parameters:

    @:param: config: dict object with config parameters containing url for schema registry (schema.registry.url).
    """
    def __init__(self, config, schema_registry=None):
        self._enable_key_decoding = config.pop("enable.key.decoding", True)

        schema_registry_url = config.pop("schema.registry.url", None)
        if schema_registry is None:
            if schema_registry_url is None:
                raise ValueError("Missing parameter: schema.registry.url")
            schema_registry = CachedSchemaRegistryClient(
                url=schema_registry_url)
        elif schema_registry_url is not None:
            raise ValueError(
                "Cannot pass schema_registry along with schema.registry.url config"
            )

        super(AvroConsumer, self).__init__(config)
        self._serializer = MessageSerializer(schema_registry)

    def poll(self, timeout=None):
        """
        This is an overridden method from the confluent_kafka.Consumer class. It handles message
        deserialization using the Avro schema.

        @:param timeout
        @:return message object with deserialized key and value as dict objects
        """
        if timeout is None:
            timeout = -1
        message = super(AvroConsumer, self).poll(timeout)
        if message is None:
            return None
        if not message.value() and not message.key():
            return message
        if not message.error():
            if message.value() is not None:
                decoded_value = self._serializer.decode_message(
                    message.value())
                message.set_value(decoded_value)
            if self._enable_key_decoding and message.key() is not None:
                decoded_key = self._serializer.decode_message(message.key())
                message.set_key(decoded_key)
        return message
Code example #20
class SimpleAvroDeserializer(Deserializer):
    def __init__(self, schema_registry_url):
        schema_registry = CachedSchemaRegistryClient(
            {'url': schema_registry_url})
        self._serializer = MessageSerializer(schema_registry, None, None)

    def __call__(self, value, ctx=None):
        if value is None:
            return None

        if ctx is not None and ctx.field == 'key':
            decoded = self._serializer.decode_message(value, is_key=True)
        else:
            decoded = self._serializer.decode_message(value, is_key=False)

        return decoded
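A hedged usage sketch; the registry URL is illustrative and msg stands for a message returned by a confluent_kafka Consumer:

deserializer = SimpleAvroDeserializer('http://localhost:8081')  # hypothetical URL
value = deserializer(msg.value())  # no ctx passed, so it is decoded as a value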
Code example #21
    def printAndProduceMessages(self):
        consumer = AvroConsumer({
            'bootstrap.servers': BOOTSTRAP_SERVERS,
            'group.id': GROUP_ID,
            'auto.offset.reset': AUTO_OFFSET_RESET,
            'enable.auto.commit': False,
            'schema.registry.url': SCHEMA_REGISTRY_URL
        })
        schema_registry = CachedSchemaRegistryClient(
            os.environ.get('SCHEMA_REGISTRY', SCHEMA_REGISTRY_URL))
        avro_serde = AvroSerde(schema_registry)

        consumer.subscribe([INPUT_TOPIC_NAME])

        while True:
            try:
                consumedMessages = consumer.consume(
                    num_messages=CONSUMER_BATCH_SIZE, timeout=1)
            except Exception as e:
                logging.error("Message pool failed: {}".format(e))
                break

            messages = []
            for consumedMessage in consumedMessages:
                consumedMessageValue = avro_serde.decode_message(
                    consumedMessage.value())

                # collect the non-array fields once; they are shared by every
                # message derived from this consumed record
                base_value = {}
                for attr, value in consumedMessageValue.items():
                    if attr != ARRAY_NAME:
                        base_value[DOCUMENT_FIELD_PREFIX + attr] = value

                for arrayItem in consumedMessageValue[ARRAY_NAME]:
                    # build a fresh dict per array item; appending the same dict
                    # repeatedly would make every list entry alias the last item
                    message = {"key": {}, "value": dict(base_value)}
                    message["key"]["id"] = consumedMessageValue["id"] + \
                        "-" + arrayItem["id"]
                    for attr, value in arrayItem.items():
                        message["value"][attr] = value
                    messages.append(message)

            self.produceMessages(messages)
            consumer.commit()
        consumer.close()
Code example #22
class _AvroIORegistry:
    def __init__(self, schema_registry_url):
        """Private implementation class for Avro IO using the registry"""
        log.info(
            f"Using registry with schema_url/id {schema_registry_url}/{config.SCHEMA_ID}"
        )
        try:
            self.client = CachedSchemaRegistryClient(url=schema_registry_url)
            self.schema = self.client.get_by_id(config.SCHEMA_ID)
            self.serializer = MessageSerializer(self.client)
        except Exception as e:
            raise ValueError("Client id or schema id not found") from e

    def decode(self, data):
        # data: raw Confluent-framed message bytes
        return self.serializer.decode_message(data)

    def encode(self, record):
        return self.serializer.encode_record_with_schema_id(
            config.SCHEMA_ID, record)
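A round-trip sketch with this wrapper; the registry URL is illustrative and config.SCHEMA_ID must identify a compatible registered schema:

registry = _AvroIORegistry('http://localhost:8081')  # hypothetical URL
payload = registry.encode({'field': 'value'})        # record must match the schema
assert registry.decode(payload) == {'field': 'value'}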
Code example #23
    def __init__(self, config, schema_registry=None, reader_key_schema=None, reader_value_schema=None):

        sr_conf = {key.replace("schema.registry.", ""): value
                   for key, value in config.items() if key.startswith("schema.registry")}

        if sr_conf.get("basic.auth.credentials.source") == 'SASL_INHERIT':
            # Fallback to plural 'mechanisms' for backward compatibility
            sr_conf['sasl.mechanism'] = config.get('sasl.mechanism', config.get('sasl.mechanisms', ''))
            sr_conf['sasl.username'] = config.get('sasl.username', '')
            sr_conf['sasl.password'] = config.get('sasl.password', '')

        ap_conf = {key: value
                   for key, value in config.items() if not key.startswith("schema.registry")}

        if schema_registry is None:
            schema_registry = CachedSchemaRegistryClient(sr_conf)
        elif sr_conf.get("url", None) is not None:
            raise ValueError("Cannot pass schema_registry along with schema.registry.url config")

        super(AvroConsumer, self).__init__(ap_conf)
        self._serializer = MessageSerializer(schema_registry, reader_key_schema, reader_value_schema)
Code example #24
def test_select(started_cluster):
    # type: (ClickHouseCluster) -> None

    schema_registry_client = CachedSchemaRegistryClient(
        'http://localhost:{}'.format(started_cluster.schema_registry_port))
    serializer = MessageSerializer(schema_registry_client)

    schema = avro.schema.make_avsc_object({
        'name': 'test_record',
        'type': 'record',
        'fields': [{
            'name': 'value',
            'type': 'long'
        }]
    })

    buf = io.BytesIO()
    for x in range(0, 3):
        message = serializer.encode_record_with_schema('test_subject', schema,
                                                       {'value': x})
        buf.write(message)
    data = buf.getvalue()

    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    schema_registry_url = "http://{}:{}".format(
        started_cluster.schema_registry_host, 8081)

    run_query(instance,
              "create table avro_data(value Int64) engine = Memory()")
    settings = {'format_avro_schema_registry_url': schema_registry_url}
    run_query(instance, "insert into avro_data format AvroConfluent", data,
              settings)
    stdout = run_query(instance, "select * from avro_data")
    assert list(map(str.split, stdout.splitlines())) == [
        ["0"],
        ["1"],
        ["2"],
    ]
Code example #25
class AvroConsumer(Consumer):
    """
    Kafka Consumer client which does avro schema decoding of messages.
    Handles message deserialization.

    The constructor takes the following parameters:

    @:param: config: dict object with config parameters containing url for schema registry (schema.registry.url).
    """
    def __init__(self, config):

        if 'schema.registry.url' not in config:
            raise ValueError("Missing parameter: schema.registry.url")
        schema_registry_url = config.pop("schema.registry.url")

        super(AvroConsumer, self).__init__(config)
        self._serializer = MessageSerializer(CachedSchemaRegistryClient(url=schema_registry_url))

    def poll(self, timeout):
        """
        This is an overridden method from the confluent_kafka.Consumer class. It handles message
        deserialization using the Avro schema.

        @:param timeout
        @:return message object with deserialized key and value as dict objects
        """
        message = super(AvroConsumer, self).poll(timeout)
        if not message:
            return message
        if not message.error():
            if message.value() is not None:
                decoded_value = self._serializer.decode_message(message.value())
                message.set_value(decoded_value)
            if message.key() is not None:
                decoded_key = self._serializer.decode_message(message.key())
                message.set_key(decoded_key)
        return message
Code example #26
    def __init__(self,
                 config,
                 default_key_schema=None,
                 default_value_schema=None,
                 schema_registry=None):
        schema_registry_url = config.pop("schema.registry.url", None)
        if schema_registry is None:
            if schema_registry_url is None:
                raise ValueError("Missing parameter: schema.registry.url")
            schema_registry = CachedSchemaRegistryClient(
                url=schema_registry_url)
        elif schema_registry_url is not None:
            raise ValueError(
                "Cannot pass schema_registry along with schema.registry.url config"
            )

        super(AvroProducer, self).__init__(config)
        self._key_serializer = config.pop("key.serializer",
                                          MessageSerializer(schema_registry))
        self._value_serializer = config.pop("value.serializer",
                                            MessageSerializer(schema_registry))
        self._key_schema = default_key_schema
        self._value_schema = default_value_schema
Code example #27
    def __init__(self, config, schema_registry=None, reader_key_schema=None, reader_value_schema=None):

        sr_conf = {key.replace("schema.registry.", ""): value
                   for key, value in config.items() if key.startswith("schema.registry")}

        if sr_conf.get("basic.auth.credentials.source") == 'SASL_INHERIT':
            sr_conf['sasl.mechanisms'] = config.get('sasl.mechanisms', '')
            sr_conf['sasl.username'] = config.get('sasl.username', '')
            sr_conf['sasl.password'] = config.get('sasl.password', '')

        ap_conf = {key: value
                   for key, value in config.items() if not key.startswith("schema.registry")}

        if schema_registry is None:
            schema_registry = CachedSchemaRegistryClient(sr_conf)
        elif sr_conf.get("url", None) is not None:
            raise ValueError("Cannot pass schema_registry along with schema.registry.url config")

        super(AvroConsumer, self).__init__(ap_conf)
        self._serializer = MessageSerializer(schema_registry, reader_key_schema, reader_value_schema)
Code example #28
class AvroProducer(Producer):
    """
        Kafka Producer client which does Avro schema encoding of messages.
        Handles schema registration and message serialization.

        The constructor takes the following parameters:

        :param dict config: Config parameters containing url for schema registry (``schema.registry.url``)
                            and the standard Kafka client configuration (``bootstrap.servers`` et al.).
        :param str default_key_schema: Optional default avro schema for key
        :param str default_value_schema: Optional default avro schema for value
    """
    def __init__(self,
                 config,
                 default_key_schema=None,
                 default_value_schema=None,
                 schema_registry=None):

        schema_registry_url = config.pop("schema.registry.url", None)
        schema_registry_ca_location = config.pop(
            "schema.registry.ssl.ca.location", None)
        schema_registry_certificate_location = config.pop(
            "schema.registry.ssl.certificate.location", None)
        schema_registry_key_location = config.pop(
            "schema.registry.ssl.key.location", None)

        if schema_registry is None:
            if schema_registry_url is None:
                raise ValueError("Missing parameter: schema.registry.url")

            schema_registry = CachedSchemaRegistryClient(
                url=schema_registry_url,
                ca_location=schema_registry_ca_location,
                cert_location=schema_registry_certificate_location,
                key_location=schema_registry_key_location)
        elif schema_registry_url is not None:
            raise ValueError(
                "Cannot pass schema_registry along with schema.registry.url config"
            )

        super(AvroProducer, self).__init__(config)
        self._serializer = MessageSerializer(schema_registry)
        self._key_schema = default_key_schema
        self._value_schema = default_value_schema

    def produce(self, **kwargs):
        """
            Asynchronously sends message to Kafka by encoding with specified or default avro schema.

            :param str topic: topic name
            :param object value: An object to serialize
            :param str value_schema: Avro schema for value
            :param object key: An object to serialize
            :param str key_schema: Avro schema for key

            Plus any other parameters accepted by confluent_kafka.Producer.produce

            :raises SerializerError: On serialization failure
            :raises BufferError: If producer queue is full.
            :raises KafkaException: For other produce failures.
        """
        # get schemas from  kwargs if defined
        key_schema = kwargs.pop('key_schema', self._key_schema)
        value_schema = kwargs.pop('value_schema', self._value_schema)
        topic = kwargs.pop('topic', None)
        if not topic:
            raise ClientError("Topic name not specified.")
        value = kwargs.pop('value', None)
        key = kwargs.pop('key', None)

        if value is not None:
            if value_schema:
                value = self._serializer.encode_record_with_schema(
                    topic, value_schema, value)
            else:
                raise ValueSerializerError("Avro schema required for values")

        if key is not None:
            if key_schema:
                key = self._serializer.encode_record_with_schema(
                    topic, key_schema, key, True)
            else:
                raise KeySerializerError("Avro schema required for key")

        super(AvroProducer, self).produce(topic, value, key, **kwargs)
Code example #29
 def setUp(self):
     # need to set up the serializer
     self.client = MockSchemaRegistryClient()
     self.ms = MessageSerializer(self.client)
Code example #30
class TestMessageSerializer(unittest.TestCase):
    def setUp(self):
        # need to set up the serializer
        self.client = MockSchemaRegistryClient()
        self.ms = MessageSerializer(self.client)

    def assertMessageIsSame(self, message, expected, schema_id):
        self.assertTrue(message)
        self.assertTrue(len(message) > 5)
        magic, sid = struct.unpack('>bI', message[0:5])
        self.assertEqual(magic, 0)
        self.assertEqual(sid, schema_id)
        decoded = self.ms.decode_message(message)
        self.assertTrue(decoded)
        self.assertEqual(decoded, expected)

    def test_encode_with_schema_id(self):
        adv = avro.loads(data_gen.ADVANCED_SCHEMA)
        basic = avro.loads(data_gen.BASIC_SCHEMA)
        subject = 'test'
        schema_id = self.client.register(subject, basic)

        records = data_gen.BASIC_ITEMS
        for record in records:
            message = self.ms.encode_record_with_schema_id(schema_id, record)
            self.assertMessageIsSame(message, record, schema_id)

        subject = 'test_adv'
        adv_schema_id = self.client.register(subject, adv)
        self.assertNotEqual(adv_schema_id, schema_id)
        records = data_gen.ADVANCED_ITEMS
        for record in records:
            message = self.ms.encode_record_with_schema_id(
                adv_schema_id, record)
            self.assertMessageIsSame(message, record, adv_schema_id)

    def test_encode_record_with_schema(self):
        topic = 'test'
        basic = avro.loads(data_gen.BASIC_SCHEMA)
        subject = 'test-value'
        schema_id = self.client.register(subject, basic)
        records = data_gen.BASIC_ITEMS
        for record in records:
            message = self.ms.encode_record_with_schema(topic, basic, record)
            self.assertMessageIsSame(message, record, schema_id)

    def test_decode_none(self):
        """"null/None messages should decode to None"""
        self.assertIsNone(self.ms.decode_message(None))

    def test_decode_with_schema(self):
        topic = 'test_specific'

        schema_v1 = avro.loads(
            data_gen.load_schema_file('evolution_schema_v1.avsc'))
        schema_v2 = avro.loads(
            data_gen.load_schema_file('evolution_schema_v2.avsc'))

        dsv1 = SpecificRecordMessageDeserializer(self.client,
                                                 value_schema=schema_v1)
        dsv2 = SpecificRecordMessageDeserializer(self.client,
                                                 value_schema=schema_v2)

        record_v1 = {"name": "suzyq", "age": 27}
        record_v2 = dict(record_v1)
        record_v2['gender'] = 'NONE'

        encoded_v1 = self.ms.encode_record_with_schema(topic, schema_v1,
                                                       record_v1)
        decoded_v1_v1 = dsv1.decode_message(encoded_v1, is_key=False)
        self.assertDictEqual(record_v1, decoded_v1_v1)
        decoded_v1_v2 = dsv2.decode_message(encoded_v1, is_key=False)
        self.assertDictEqual(record_v2, decoded_v1_v2)

        encoded_v2 = self.ms.encode_record_with_schema(topic, schema_v2,
                                                       record_v2)
        decoded_v2_v2 = dsv2.decode_message(encoded_v2, is_key=False)
        self.assertDictEqual(record_v2, decoded_v2_v2)
        decoded_v2_v1 = dsv1.decode_message(encoded_v2, is_key=False)
        self.assertDictEqual(record_v1, decoded_v2_v1)

    def hash_func(self):
        return hash(str(self))
Code example #31
 def setUp(self):
     # need to set up the serializer
     self.client = MockSchemaRegistryClient()
     self.ms = MessageSerializer(self.client)
Code example #32
class AvroConsumer(Consumer):
    """
    Kafka Consumer client which does Avro schema decoding of messages.
    Handles message deserialization.

    The constructor takes the following parameters:

    :param dict config: Config parameters containing url for schema registry (``schema.registry.url``)
                        and the standard Kafka client configuration (``bootstrap.servers`` et al.)
    :param schema reader_key_schema: a reader schema for the message key
    :param schema reader_value_schema: a reader schema for the message value
    :raises ValueError: For invalid configurations
    """

    def __init__(self, config, schema_registry=None, reader_key_schema=None, reader_value_schema=None):

        sr_conf = {key.replace("schema.registry.", ""): value
                   for key, value in config.items() if key.startswith("schema.registry")}

        if sr_conf.get("basic.auth.credentials.source") == 'SASL_INHERIT':
            sr_conf['sasl.mechanisms'] = config.get('sasl.mechanisms', '')
            sr_conf['sasl.username'] = config.get('sasl.username', '')
            sr_conf['sasl.password'] = config.get('sasl.password', '')

        ap_conf = {key: value
                   for key, value in config.items() if not key.startswith("schema.registry")}

        if schema_registry is None:
            schema_registry = CachedSchemaRegistryClient(sr_conf)
        elif sr_conf.get("url", None) is not None:
            raise ValueError("Cannot pass schema_registry along with schema.registry.url config")

        super(AvroConsumer, self).__init__(ap_conf)
        self._serializer = MessageSerializer(schema_registry, reader_key_schema, reader_value_schema)

    def poll(self, timeout=None):
        """
        This is an overridden method from the confluent_kafka.Consumer class. It handles message
        deserialization using the Avro schema.

        :param float timeout: Poll timeout in seconds (default: indefinite)
        :returns: message object with deserialized key and value as dict objects
        :rtype: Message
        """
        if timeout is None:
            timeout = -1
        message = super(AvroConsumer, self).poll(timeout)
        if message is None:
            return None

        if not message.error():
            try:
                if message.value() is not None:
                    decoded_value = self._serializer.decode_message(message.value(), is_key=False)
                    message.set_value(decoded_value)
                if message.key() is not None:
                    decoded_key = self._serializer.decode_message(message.key(), is_key=True)
                    message.set_key(decoded_key)
            except SerializerError as e:
                raise SerializerError("Message deserialization failed for message at {} [{}] offset {}: {}".format(
                    message.topic(),
                    message.partition(),
                    message.offset(),
                    e))
        return message
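A typical consume loop over this class; broker, group, and topic names are illustrative:

c = AvroConsumer({'bootstrap.servers': 'localhost:9092',
                  'group.id': 'example-group',
                  'schema.registry.url': 'http://localhost:8081'})
c.subscribe(['my-avro-topic'])
try:
    while True:
        msg = c.poll(1.0)
        if msg is None:
            continue
        if msg.error():
            continue
        print(msg.key(), msg.value())  # key and value already decoded to dicts
finally:
    c.close()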
Code example #33
class AvroProducer(Producer):
    """
        Kafka Producer client which does Avro schema encoding of messages.
        Handles schema registration and message serialization.

        The constructor takes the following parameters:

        :param dict config: Config parameters containing url for schema registry (``schema.registry.url``)
                            and the standard Kafka client configuration (``bootstrap.servers`` et al.).
        :param str default_key_schema: Optional default avro schema for key
        :param str default_value_schema: Optional default avro schema for value
    """

    def __init__(self, config, default_key_schema=None,
                 default_value_schema=None, schema_registry=None):

        sr_conf = {key.replace("schema.registry.", ""): value
                   for key, value in config.items() if key.startswith("schema.registry")}

        if sr_conf.get("basic.auth.credentials.source") == 'SASL_INHERIT':
            sr_conf['sasl.mechanisms'] = config.get('sasl.mechanisms', '')
            sr_conf['sasl.username'] = config.get('sasl.username', '')
            sr_conf['sasl.password'] = config.get('sasl.password', '')

        ap_conf = {key: value
                   for key, value in config.items() if not key.startswith("schema.registry")}

        if schema_registry is None:
            schema_registry = CachedSchemaRegistryClient(sr_conf)
        elif sr_conf.get("url", None) is not None:
            raise ValueError("Cannot pass schema_registry along with schema.registry.url config")

        super(AvroProducer, self).__init__(ap_conf)
        self._serializer = MessageSerializer(schema_registry)
        self._key_schema = default_key_schema
        self._value_schema = default_value_schema

    def produce(self, **kwargs):
        """
            Asynchronously sends message to Kafka by encoding with specified or default avro schema.

            :param str topic: topic name
            :param object value: An object to serialize
            :param str value_schema: Avro schema for value
            :param object key: An object to serialize
            :param str key_schema: Avro schema for key

            Plus any other parameters accepted by confluent_kafka.Producer.produce

            :raises SerializerError: On serialization failure
            :raises BufferError: If producer queue is full.
            :raises KafkaException: For other produce failures.
        """
        # get schemas from  kwargs if defined
        key_schema = kwargs.pop('key_schema', self._key_schema)
        value_schema = kwargs.pop('value_schema', self._value_schema)
        topic = kwargs.pop('topic', None)
        if not topic:
            raise ClientError("Topic name not specified.")
        value = kwargs.pop('value', None)
        key = kwargs.pop('key', None)

        if value is not None:
            if value_schema:
                value = self._serializer.encode_record_with_schema(topic, value_schema, value)
            else:
                raise ValueSerializerError("Avro schema required for values")

        if key is not None:
            if key_schema:
                key = self._serializer.encode_record_with_schema(topic, key_schema, key, True)
            else:
                raise KeySerializerError("Avro schema required for key")

        super(AvroProducer, self).produce(topic, value, key, **kwargs)
Code example #34
from pyspark.sql import SQLContext, SparkSession

from pyspark.streaming import StreamingContext
from confluent_kafka.avro.cached_schema_registry_client import CachedSchemaRegistryClient
from confluent_kafka.avro.serializer.message_serializer import MessageSerializer

from pyspark.streaming.kafka import KafkaUtils

import json

var_schema_url = 'http://localhost:8081'
var_kafka_parms_src = {"metadata.broker.list": 'localhost:9092'}

schema_registry_client = CachedSchemaRegistryClient(var_schema_url)
serializer = MessageSerializer(schema_registry_client)

spark = SparkSession.builder \
  .appName('Advertiser_stream') \
  .master('local[*]') \
  .getOrCreate()


def handler(message):
    records = message.collect()
    for record in records:
        var_val_key = record[0]
        var_val_value = record[1]
        print(type(var_val_key))
        print(type(var_val_value))
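The snippet defines a handler and a serializer but never wires them together. One plausible completion, assuming Spark 2.x's pyspark.streaming.kafka API and a hypothetical 'advertiser' topic, is to decode values with the serializer while building a direct stream:

ssc = StreamingContext(spark.sparkContext, 10)
stream = KafkaUtils.createDirectStream(
    ssc, ['advertiser'], var_kafka_parms_src,
    valueDecoder=serializer.decode_message)  # decode Avro payloads on read
stream.foreachRDD(handler)
ssc.start()
ssc.awaitTermination()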

Code example #35
def test_kafka_destination_expression_partitioner_avro(sdc_builder,
                                                       sdc_executor, cluster,
                                                       confluent):
    """This test ensures that the correct serializer is set when producing AVRO records and using
    EXPRESSION partition strategy. We do so by setting the confluent serializer in the stage config, and also
    setting it to the kafka consumer used in the test. The consumer won't be able to deserialize the records
    if they're not serialized in AVRO.
    """
    topic = get_random_string(string.ascii_letters, 10)
    logger.debug('Kafka topic name: %s', topic)

    data = {'myLongField1': 'My Long Message'}

    # Build the Kafka destination pipeline.
    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    source = builder.add_stage('Dev Raw Data Source')
    source.set_attributes(stop_after_first_batch=True,
                          data_format='JSON',
                          raw_data=json.dumps(data))

    destination = builder.add_stage(
        name='com_streamsets_pipeline_stage_destination_kafka_KafkaDTarget',
        library=cluster.kafka.standalone_stage_lib)
    # Set configuration to use AVRO with a registered schema in confluent, and expression partition strategy
    destination.set_attributes(topic=topic,
                               data_format='AVRO',
                               message_key_format='AVRO',
                               avro_schema_location='REGISTRY',
                               lookup_schema_by='SUBJECT',
                               schema_subject=f'{topic}-value',
                               include_schema=False,
                               partition_strategy='EXPRESSION',
                               partition_expression='${0}',
                               kafka_message_key='',
                               key_serializer='CONFLUENT',
                               value_serializer='CONFLUENT')

    source >> destination
    pipeline = builder\
        .build(title='Kafka Destination pipeline with Expression Partitioner')\
        .configure_for_environment(cluster, confluent)

    sdc_executor.add_pipeline(pipeline)

    # Create the avro schema and register it to confluent
    field = avro.schema.Field(type=avro.schema.PrimitiveSchema(
        avro.schema.STRING),
                              name='myLongField1',
                              index=0,
                              has_default=False)
    schema = avro.schema.RecordSchema(name=f'value_{topic}',
                                      namespace=None,
                                      fields=[field],
                                      names=avro.schema.Names())
    confluent.schema_registry.register(f'{topic}-value', schema)

    # Set the confluent serializer to the kafka consumer
    serializer = MessageSerializer(confluent.schema_registry)
    consumer = cluster.kafka.consumer(
        consumer_timeout_ms=1000,
        auto_offset_reset='earliest',
        key_deserializer=partial(serializer.decode_message, is_key=True),
        value_deserializer=partial(serializer.decode_message, is_key=False))
    consumer.subscribe([topic])

    sdc_executor.start_pipeline(pipeline).wait_for_finished()

    msgs_received = [message for message in consumer]

    assert 1 == len(msgs_received)
    assert [message.value for message in msgs_received] == [data]