def setUp(self):
    """Prepare a serializer backed by a mock schema-registry client."""
    # need to set up the serializer
    # Make RecordSchema and PrimitiveSchema hashable
    # NOTE: this mutates the avro `schema` module classes process-wide,
    # not just for this test case.
    schema.RecordSchema.__hash__ = self.hash_func
    schema.PrimitiveSchema.__hash__ = self.hash_func
    self.client = MockSchemaRegistryClient()
    self.ms = MessageSerializer(self.client)
def __init__(self, config, default_key_schema=None,
             default_value_schema=None, schema_registry=None,
             subject_name_strategy=SubjectNameStrategy.RecordNameStrategy
             ):
    """
    Split ``schema.registry.*`` settings out of ``config``, build (or accept)
    a registry client, and initialise the underlying producer with the rest.

    :raises ValueError: if a schema_registry client is passed together with
        a ``schema.registry.url`` entry in config.
    """
    # Registry settings, with the "schema.registry." prefix stripped.
    sr_conf = {key.replace("schema.registry.", ""): value
               for key, value in config.items() if key.startswith("schema.registry")}

    if sr_conf.get("basic.auth.credentials.source") == 'SASL_INHERIT':
        # Fallback to plural 'mechanisms' for backward compatibility
        sr_conf['sasl.mechanism'] = config.get('sasl.mechanism', config.get('sasl.mechanisms', ''))
        sr_conf['sasl.username'] = config.get('sasl.username', '')
        sr_conf['sasl.password'] = config.get('sasl.password', '')

    # Schemas are auto-registered with the registry unless disabled in config.
    sr_conf['auto.register.schemas'] = config.get('auto.register.schemas', True)

    # Everything that is not a registry key goes to the Kafka producer.
    ap_conf = {key: value for key, value in config.items()
               if not key.startswith("schema.registry")}

    if schema_registry is None:
        schema_registry = CachedSchemaRegistryClient(sr_conf)
    elif sr_conf.get("url", None) is not None:
        raise ValueError("Cannot pass schema_registry along with schema.registry.url config")

    super(AvroProducer, self).__init__(ap_conf)
    self._serializer = MessageSerializer(schema_registry, subject_name_strategy=subject_name_strategy)
    self._key_schema = default_key_schema
    self._value_schema = default_value_schema
def __init__(self, config, default_key_schema=None,
             default_value_schema=None, schema_registry=None):
    """
    Partition ``config`` into schema-registry settings and plain producer
    settings, then wire up the registry client and message serializer.
    """
    registry_settings = {}
    producer_settings = {}
    for key, value in config.items():
        if key.startswith("schema.registry"):
            registry_settings[key.replace("schema.registry.", "")] = value
        else:
            producer_settings[key] = value

    if registry_settings.get("basic.auth.credentials.source") == 'SASL_INHERIT':
        # Inherit SASL credentials from the producer configuration.
        for field in ('sasl.mechanisms', 'sasl.username', 'sasl.password'):
            registry_settings[field] = config.get(field, '')

    if schema_registry is None:
        schema_registry = CachedSchemaRegistryClient(registry_settings)
    elif registry_settings.get("url", None) is not None:
        raise ValueError(
            "Cannot pass schema_registry along with schema.registry.url config"
        )

    super(AvroProducer, self).__init__(producer_settings)
    self._serializer = MessageSerializer(schema_registry)
    self._key_schema = default_key_schema
    self._value_schema = default_value_schema
def __init__(self, config, default_key_schema=None,
             default_value_schema=None, schema_registry=None):
    """
    Extract schema-registry URL and TLS settings from ``config`` before
    handing the remainder to the base producer.

    :raises ValueError: if neither a registry client nor a URL is available,
        or if both are supplied at once.
    """
    # NOTE: pops mutate the caller's config dict so the leftover keys are
    # valid for the underlying Kafka producer.
    schema_registry_url = config.pop("schema.registry.url", None)
    schema_registry_ca_location = config.pop(
        "schema.registry.ssl.ca.location", None)
    schema_registry_certificate_location = config.pop(
        "schema.registry.ssl.certificate.location", None)
    schema_registry_key_location = config.pop(
        "schema.registry.ssl.key.location", None)

    if schema_registry is None:
        if schema_registry_url is None:
            raise ValueError("Missing parameter: schema.registry.url")
        schema_registry = CachedSchemaRegistryClient(
            url=schema_registry_url,
            ca_location=schema_registry_ca_location,
            cert_location=schema_registry_certificate_location,
            key_location=schema_registry_key_location)
    elif schema_registry_url is not None:
        raise ValueError(
            "Cannot pass schema_registry along with schema.registry.url config"
        )

    super(AvroProducer, self).__init__(config)
    self._serializer = MessageSerializer(schema_registry)
    self._key_schema = default_key_schema
    self._value_schema = default_value_schema
def produce(config, topic, input_messages):
    """
    Serialize and publish a batch of messages to Kafka.

    Looks up the latest value schema for ``topic`` in the schema registry,
    Avro-encodes each value and produces it under its UUID key.

    Parameters
    ----------
    config: dict
        the config values needed by the producer
    topic: str
        topic where the input messages are published to
    input_messages: dict
        key/value input messages; keys must be UUID4 strings
    """
    if topic is None:
        logger.debug('Required topic field must be set')
        raise ValueError('Required topic field must be set')

    # Idiom fix: `len(x) <= 0` replaced with the standard emptiness test.
    if not input_messages:
        logger.debug('Required data field must not be empty.')
        raise ValueError('Required data field must not be empty.')

    bootstrap_servers, schema_registry = producer_config(config)
    producer = Producer(bootstrap_servers)
    admin_client = AdminClient(bootstrap_servers)
    topics = admin_client.list_topics().topics  # Just to show what's available
    print(topics)
    if not topics:
        print('Not Topics')
        raise RuntimeError('No topics available on the broker')

    sr = CachedSchemaRegistryClient(schema_registry)
    ser = MessageSerializer(sr)
    # get schema (renamed from `id` to avoid shadowing the builtin; unused here)
    schema_id, schema, version = sr.get_latest_schema(topic + "-value")
    if schema:
        print('In If Schema')
        for key, value in input_messages.items():
            if validate_uuid4(key):
                print('In validate in For loop')
                serializedMessage = ser.encode_record_with_schema(
                    topic, schema, value)
                producer.produce(topic=topic, key=key,
                                 value=serializedMessage, callback=acked)
                # producer.flush()  # bad idea, it limits throughput to the broker round trip time
                producer.poll(1)
            else:
                print('In Else of For Loop')
                # BUG FIX: the previous call had no %s placeholder, so the
                # offending key never appeared in the log record.
                logger.error('Invalid UUID String: %s', key)
    else:
        print('Schema not found for topic name: ', topic)
        print('In Else Schema')
        sys.exit(1)
class AvroProducer(Producer):
    """
    Kafka Producer client which does avro schema encoding to messages.
    Handles schema registration, Message serialization.

    Constructor takes below parameters

    @:param: config: dict object with config parameters containing url for schema registry (schema.registry.url).
    @:param: default_key_schema: Optional avro schema for key
    @:param: default_value_schema: Optional avro schema for value
    """

    def __init__(self, config, default_key_schema=None, default_value_schema=None):
        if 'schema.registry.url' not in config.keys():
            raise ValueError("Missing parameter: schema.registry.url")
        schem_registry_url = config["schema.registry.url"]
        del config["schema.registry.url"]

        super(AvroProducer, self).__init__(config)
        self._serializer = MessageSerializer(
            CachedSchemaRegistryClient(url=schem_registry_url))
        self._key_schema = default_key_schema
        self._value_schema = default_value_schema

    def produce(self, **kwargs):
        """
        Sends message to kafka by encoding with specified avro schema
        @:param: topic: topic name
        @:param: value: A dictionary object
        @:param: value_schema : Avro schema for value
        @:param: key: A dictionary object
        @:param: key_schema : Avro schema for key
        @:exception: SerializerError
        """
        # get schemas from kwargs if defined
        key_schema = kwargs.pop('key_schema', self._key_schema)
        value_schema = kwargs.pop('value_schema', self._value_schema)
        topic = kwargs.pop('topic', None)
        if not topic:
            raise ClientError("Topic name not specified.")
        value = kwargs.pop('value', None)
        key = kwargs.pop('key', None)

        # BUG FIX: the previous truthiness checks (`if value:` / `if key:`)
        # skipped encoding for falsy-but-valid payloads such as {} or 0,
        # producing them raw; test explicitly against None instead.
        if value is not None:
            if value_schema:
                value = self._serializer.encode_record_with_schema(
                    topic, value_schema, value)
            else:
                raise SerializerError("Avro schema required for value")

        if key is not None:
            if key_schema:
                key = self._serializer.encode_record_with_schema(
                    topic, key_schema, key, True)
            else:
                raise SerializerError("Avro schema required for key")

        super(AvroProducer, self).produce(topic, value, key, **kwargs)
class AvroProducer(object):
    """
    Producer wrapper that Avro-encodes message keys/values through a schema
    registry before delegating to an inner confluent_kafka Producer.
    """

    def __init__(self, config, default_key_schema=None,
                 default_value_schema=None, schema_registry=None):
        # "schema.registry.*" keys are popped so the remaining dict is a
        # valid configuration for the plain Kafka Producer.
        schema_registry_url = config.pop("schema.registry.url", None)
        schema_registry_ca_location = config.pop(
            "schema.registry.ssl.ca.location", None)
        schema_registry_certificate_location = config.pop(
            "schema.registry.ssl.certificate.location", None)
        schema_registry_key_location = config.pop(
            "schema.registry.ssl.key.location", None)

        if schema_registry is None:
            if schema_registry_url is None:
                raise ValueError("Missing parameter: schema.registry.url")
            schema_registry = CachedSchemaRegistryClient(
                url=schema_registry_url,
                ca_location=schema_registry_ca_location,
                cert_location=schema_registry_certificate_location,
                key_location=schema_registry_key_location)
        elif schema_registry_url is not None:
            # Injecting a client and a URL at the same time is ambiguous.
            raise ValueError(
                "Cannot pass schema_registry along with schema.registry.url config"
            )

        self.producer = Producer(config)
        self._serializer = MessageSerializer(schema_registry)
        self._key_schema = default_key_schema
        self._value_schema = default_value_schema

    def flush(self):
        # Delegate: block until all queued messages have been delivered.
        self.producer.flush()

    def produce(self, **kwargs):
        # Per-call schemas fall back to the constructor defaults.
        key_schema = kwargs.pop('key_schema', self._key_schema)
        value_schema = kwargs.pop('value_schema', self._value_schema)
        topic = kwargs.pop('topic', None)
        if not topic:
            raise ClientError("Topic name not specified.")
        value = kwargs.pop('value', None)
        key = kwargs.pop('key', None)

        if value is not None:
            if value_schema:
                value = self._serializer.encode_record_with_schema(
                    topic, value_schema, value)
            else:
                raise ValueSerializerError("Avro schema required for values")

        if key is not None:
            if key_schema:
                key = self._serializer.encode_record_with_schema(
                    topic, key_schema, key, True)
            # NOTE(review): unlike the value path, a key without a key_schema
            # is passed through unencoded instead of raising — presumably to
            # allow raw (string/bytes) keys; confirm this is intentional.

        self.producer.produce(topic, value, key, **kwargs)
class AvroConsumer(Consumer):
    """
    Kafka Consumer client which does avro schema decoding of messages.
    Handles message deserialization.

    Constructor takes below parameters

    :param dict config: Config parameters containing url for schema registry
                        (``schema.registry.url``) and the standard Kafka client
                        configuration (``bootstrap.servers`` et.al).
    """
    def __init__(self, config, schema_registry=None):
        # Registry (and its TLS) settings are popped so the remaining config
        # is valid for the underlying confluent_kafka Consumer.
        schema_registry_url = config.pop("schema.registry.url", None)
        schema_registry_ca_location = config.pop("schema.registry.ssl.ca.location", None)
        schema_registry_certificate_location = config.pop("schema.registry.ssl.certificate.location", None)
        schema_registry_key_location = config.pop("schema.registry.ssl.key.location", None)

        if schema_registry is None:
            if schema_registry_url is None:
                raise ValueError("Missing parameter: schema.registry.url")
            schema_registry = CachedSchemaRegistryClient(url=schema_registry_url,
                                                         ca_location=schema_registry_ca_location,
                                                         cert_location=schema_registry_certificate_location,
                                                         key_location=schema_registry_key_location)
        elif schema_registry_url is not None:
            raise ValueError("Cannot pass schema_registry along with schema.registry.url config")

        super(AvroConsumer, self).__init__(config)
        self._serializer = MessageSerializer(schema_registry)

    def poll(self, timeout=None, with_schema=False):
        """
        This is an overriden method from confluent_kafka.Consumer class. This handles
        message deserialization using avro schema

        :param float timeout: Poll timeout in seconds (default: indefinite)
        :param boolean with_schema: If true, the key_schema and value_schema are
                                    added as properties of the message (default: False)
        :returns: message object with deserialized key and value as dict objects
        :rtype: Message or AvroMessage
        """
        if timeout is None:
            timeout = -1
        message = super(AvroConsumer, self).poll(timeout)
        key_schema = value_schema = None
        if message is None:
            return None
        if not message.value() and not message.key():
            # Nothing to decode (e.g. empty payloads); return the raw message.
            return message
        if not message.error():
            if message.value() is not None:
                decoded_value, value_schema = self._serializer.decode_message(message.value())
                message.set_value(decoded_value)
            if message.key() is not None:
                decoded_key, key_schema = self._serializer.decode_message(message.key())
                message.set_key(decoded_key)
        return message if not with_schema else AvroMessage(key_schema, value_schema, message)
def __init__(self, config):
    """Validate the registry URL, strip it from config, and build the serializer."""
    if 'schema.registry.url' not in config:
        raise ValueError("Missing parameter: schema.registry.url")
    registry_url = config.pop("schema.registry.url")
    super(AvroConsumer, self).__init__(config)
    registry_client = CachedSchemaRegistryClient(url=registry_url)
    self._serializer = MessageSerializer(registry_client)
def __init__(self, config, default_key_schema=None, default_value_schema=None):
    """Strip the registry URL from config, build the serializer, and store default schemas."""
    try:
        registry_url = config.pop("schema.registry.url")
    except KeyError:
        raise ValueError("Missing parameter: schema.registry.url")
    super(AvroProducer, self).__init__(config)
    registry_client = CachedSchemaRegistryClient(url=registry_url)
    self._serializer = MessageSerializer(registry_client)
    self._key_schema = default_key_schema
    self._value_schema = default_value_schema
class TestMessageSerializer(unittest.TestCase):
    """Tests for MessageSerializer against a mock schema-registry client
    (variant where decode_message returns (value, schema))."""

    def setUp(self):
        # need to set up the serializer
        self.client = MockSchemaRegistryClient()
        self.ms = MessageSerializer(self.client)

    def assertMessageIsSame(self, message, expected, schema_id, schema):
        # Confluent wire format header: magic byte 0, then a big-endian
        # 4-byte schema id, then the Avro payload.
        self.assertTrue(message)
        self.assertTrue(len(message) > 5)
        magic, sid = struct.unpack('>bI', message[0:5])
        self.assertEqual(magic, 0)
        self.assertEqual(sid, schema_id)
        decoded_msg, decoded_schema = self.ms.decode_message(message)
        self.assertTrue(decoded_msg)
        self.assertEqual(decoded_msg, expected)
        self.assertEqual(decoded_schema, schema)

    def test_encode_with_schema_id(self):
        adv = avro.loads(data_gen.ADVANCED_SCHEMA)
        basic = avro.loads(data_gen.BASIC_SCHEMA)
        subject = 'test'
        schema_id = self.client.register(subject, basic)

        records = data_gen.BASIC_ITEMS
        for record in records:
            message = self.ms.encode_record_with_schema_id(schema_id, record)
            self.assertMessageIsSame(message, record, schema_id, basic)

        # Registering a second subject must yield a distinct schema id.
        subject = 'test_adv'
        adv_schema_id = self.client.register(subject, adv)
        self.assertNotEqual(adv_schema_id, schema_id)
        records = data_gen.ADVANCED_ITEMS
        for record in records:
            message = self.ms.encode_record_with_schema_id(
                adv_schema_id, record)
            self.assertMessageIsSame(message, record, adv_schema_id, adv)

    def test_encode_record_with_schema(self):
        topic = 'test'
        basic = avro.loads(data_gen.BASIC_SCHEMA)
        subject = 'test-value'
        schema_id = self.client.register(subject, basic)
        records = data_gen.BASIC_ITEMS
        for record in records:
            message = self.ms.encode_record_with_schema(topic, basic, record)
            self.assertMessageIsSame(message, record, schema_id, basic)

    def test_decode_none(self):
        """"null/None messages should decode to None"""
        self.assertIsNone(self.ms.decode_message(None))

    def hash_func(self):
        # NOTE(review): not referenced within this class; presumably assigned
        # as __hash__ on the avro schema classes elsewhere — confirm caller.
        return hash(str(self))
def read_from_offset(self, offset=0, lang='json', schema=None):
    '''
    Kafka read message

    Read json and avro messages from consumer, starting at ``offset`` on
    partition 0 of ``self.topic``.

    :param offset: partition offset to seek to before reading
    :param lang: 'avro' to decode values via the schema registry;
                 anything else returns the raw value
    :param schema: unused; kept for interface compatibility
    :returns: the last message read (a JSON string for avro, the raw
              value otherwise), or None when no message was consumed
    '''
    log.debug("[KafkaDriver][read_from_offset] lang: " + str(lang))
    log.debug("[KafkaDriver][read_from_offset] offset: " + str(offset))

    def outputJSON(obj):
        '''
        Default JSON serializer: datetimes become epoch milliseconds.
        '''
        if isinstance(obj, datetime.datetime):
            return int(obj.strftime("%s%f")[:-3])
        return obj

    ret = None
    log.debug("[KafkaDriver][read_from_offset] read start: " + str(self.server))
    consumer = KafkaConsumer(bootstrap_servers=self.server + ':9092',
                             auto_offset_reset='earliest',
                             consumer_timeout_ms=1000)
    partition = TopicPartition(self.topic, 0)
    consumer.assign([partition])
    consumer.seek_to_end(partition)
    consumer.seek(partition, offset)

    if lang == 'avro':
        # BUG FIX: the registry client and serializer were previously rebuilt
        # inside the message loop — once per consumed message; build once here.
        schema_registry = CachedSchemaRegistryClient(
            url='http://' + self.schema_registry + ':8081')
        self._serializer = MessageSerializer(schema_registry)

    for msg in consumer:
        if lang == 'avro':
            message = self._serializer.decode_message(msg.value)
            message = json.dumps(message, indent=4, sort_keys=True,
                                 default=outputJSON)
            ret = message
        else:
            message = msg.value
            ret = msg.value
        log.debug("[KafkaDriver][read_from_offset] msg: " + str(message) +
                  " msg.offset: " + str(msg.offset))
    consumer.close()
    log.debug("[KafkaDriver][read_from_offset] read end")
    return ret
def __init__(self, schema_registry_url):
    """Private implementation class for Avro IO using the registry.

    Connects to the registry, then caches the client, the schema for
    config.SCHEMA_ID, and a serializer bound to the client.

    :raises ValueError: if the registry client or schema lookup fails.
    """
    log.info(
        f"Using registry with schema_url/id {schema_registry_url}/{config.SCHEMA_ID}"
    )
    try:
        self.client = CachedSchemaRegistryClient(url=schema_registry_url)
        self.schema = self.client.get_by_id(config.SCHEMA_ID)
        self.serializer = MessageSerializer(self.client)
    except Exception as exc:
        # BUG FIX: a bare `except:` also swallowed KeyboardInterrupt /
        # SystemExit and discarded the root cause; narrow and chain it.
        raise ValueError("Client id or schema id not found") from exc
def __init__(self, producer, schema_registry_url,
             default_key_schema=None, default_value_schema=None):
    """Wrap an existing producer with a registry-backed Avro serializer."""
    registry_client = CachedSchemaRegistryClient(url=schema_registry_url)
    self._producer = producer
    self._serializer = MessageSerializer(registry_client)
    self.key_schema = default_key_schema
    self.value_schema = default_value_schema
def consume(config, topic, handler):
    """
    Starts a consumer and calls the given handler for each consumed message.

    Assumes that keys are serialized as strings and values are serialized
    as Avro objects with their schemas stored in a Confluent Schema Registry.
    """
    # Kafka consumer settings: everything that is not a schema-registry key.
    # Idiom fix: flattened nested ifs and `not value is None` -> `is not None`.
    c_conf = {}
    for key, value in config.items():
        if not key.startswith("schema.registry") and value is not None:
            c_conf[key] = value.strip()

    if "auto.offset.reset" in c_conf:
        print("offset provided")
    else:
        c_conf['auto.offset.reset'] = 'earliest'

    if "group.id" in c_conf:
        print("group id provided")
    else:
        c_conf['group.id'] = 'sme_test'

    c = Consumer(c_conf)
    c.subscribe([topic])

    # Schema-registry settings, with the "schema.registry." prefix stripped.
    sr_conf = {key.replace("schema.registry.", ""): value.strip()
               for key, value in config.items()
               if key.startswith("schema.registry")}
    sr = CachedSchemaRegistryClient(sr_conf)
    ser = MessageSerializer(sr)

    while True:
        try:
            msg = c.poll(10)
            if msg is None:
                print('No Messages')
                continue
            if msg.error():
                log.error("Consumer error: {}".format(msg.error()))
                continue
            key = msg.key().decode('utf-8')
            value = ser.decode_message(msg.value(), is_key=False)
        except Exception as e:
            log.error("Message consumption failed: {}".format(e))
            break
        try:
            handler(key, value)
        except Exception as e:
            log.error("Message handler failed: {}".format(e))
            break
    c.close()
class TestMessageSerializer(unittest.TestCase):
    """Tests for MessageSerializer against a mock schema-registry client
    (variant where decode_message returns the decoded value only)."""

    def setUp(self):
        # need to set up the serializer
        self.client = MockSchemaRegistryClient()
        self.ms = MessageSerializer(self.client)

    def assertMessageIsSame(self, message, expected, schema_id):
        # Confluent wire format header: magic byte 0, then a big-endian
        # 4-byte schema id, then the Avro payload.
        self.assertTrue(message)
        self.assertTrue(len(message) > 5)
        magic, sid = struct.unpack('>bI', message[0:5])
        self.assertEqual(magic, 0)
        self.assertEqual(sid, schema_id)
        decoded = self.ms.decode_message(message)
        self.assertTrue(decoded)
        self.assertEqual(decoded, expected)

    def test_encode_with_schema_id(self):
        adv = avro.loads(data_gen.ADVANCED_SCHEMA)
        basic = avro.loads(data_gen.BASIC_SCHEMA)
        subject = 'test'
        schema_id = self.client.register(subject, basic)

        records = data_gen.BASIC_ITEMS
        for record in records:
            message = self.ms.encode_record_with_schema_id(schema_id, record)
            self.assertMessageIsSame(message, record, schema_id)

        # Registering a second subject must yield a distinct schema id.
        subject = 'test_adv'
        adv_schema_id = self.client.register(subject, adv)
        self.assertNotEqual(adv_schema_id, schema_id)
        records = data_gen.ADVANCED_ITEMS
        for record in records:
            message = self.ms.encode_record_with_schema_id(adv_schema_id, record)
            self.assertMessageIsSame(message, record, adv_schema_id)

    def test_encode_record_with_schema(self):
        topic = 'test'
        basic = avro.loads(data_gen.BASIC_SCHEMA)
        subject = 'test-value'
        schema_id = self.client.register(subject, basic)
        records = data_gen.BASIC_ITEMS
        for record in records:
            message = self.ms.encode_record_with_schema(topic, basic, record)
            self.assertMessageIsSame(message, record, schema_id)

    def test_decode_none(self):
        """"null/None messages should decode to None"""
        self.assertIsNone(self.ms.decode_message(None))

    def hash_func(self):
        # NOTE(review): not referenced within this class; presumably assigned
        # as __hash__ on the avro schema classes elsewhere — confirm caller.
        return hash(str(self))
class TestMessageSerializer(unittest.TestCase):
    """Tests for MessageSerializer, parsing schemas with
    util.parse_schema_from_string and a mock registry client."""

    def setUp(self):
        # need to set up the serializer
        # Make RecordSchema and PrimitiveSchema hashable
        # NOTE: this mutates the avro schema classes process-wide.
        schema.RecordSchema.__hash__ = self.hash_func
        schema.PrimitiveSchema.__hash__ = self.hash_func
        self.client = MockSchemaRegistryClient()
        self.ms = MessageSerializer(self.client)

    def assertMessageIsSame(self, message, expected, schema_id):
        # Confluent wire format header: magic byte 0, then a big-endian
        # 4-byte schema id, then the Avro payload.
        self.assertTrue(message)
        self.assertTrue(len(message) > 5)
        magic, sid = struct.unpack('>bI', message[0:5])
        self.assertEqual(magic, 0)
        self.assertEqual(sid, schema_id)
        decoded = self.ms.decode_message(message)
        self.assertTrue(decoded)
        self.assertEqual(decoded, expected)

    def test_encode_with_schema_id(self):
        adv = util.parse_schema_from_string(data_gen.ADVANCED_SCHEMA)
        basic = util.parse_schema_from_string(data_gen.BASIC_SCHEMA)
        subject = 'test'
        schema_id = self.client.register(subject, basic)

        records = data_gen.BASIC_ITEMS
        for record in records:
            message = self.ms.encode_record_with_schema_id(schema_id, record)
            self.assertMessageIsSame(message, record, schema_id)

        # Registering a second subject must yield a distinct schema id.
        subject = 'test_adv'
        adv_schema_id = self.client.register(subject, adv)
        self.assertNotEqual(adv_schema_id, schema_id)
        records = data_gen.ADVANCED_ITEMS
        for record in records:
            message = self.ms.encode_record_with_schema_id(
                adv_schema_id, record)
            self.assertMessageIsSame(message, record, adv_schema_id)

    def test_encode_record_with_schema(self):
        topic = 'test'
        basic = util.parse_schema_from_string(data_gen.BASIC_SCHEMA)
        subject = 'test-value'
        schema_id = self.client.register(subject, basic)
        records = data_gen.BASIC_ITEMS
        for record in records:
            message = self.ms.encode_record_with_schema(topic, basic, record)
            self.assertMessageIsSame(message, record, schema_id)

    def hash_func(self):
        # Used as __hash__ for the avro schema classes (see setUp).
        return hash(str(self))
def __init__(self, config, schema_registry=None):
    """Resolve the schema-registry client (from config or injection) and
    initialise the underlying consumer."""
    registry_url = config.pop("schema.registry.url", None)
    if schema_registry is not None:
        # An injected client and a URL at the same time is ambiguous.
        if registry_url is not None:
            raise ValueError(
                "Cannot pass schema_registry along with schema.registry.url config"
            )
    else:
        if registry_url is None:
            raise ValueError("Missing parameter: schema.registry.url")
        schema_registry = CachedSchemaRegistryClient(url=registry_url)
    super(AvroConsumer, self).__init__(config)
    self._serializer = MessageSerializer(schema_registry)
class AvroConsumer(Consumer):
    """
    Kafka Consumer client which does avro schema decoding of messages.
    Handles message deserialization.

    Constructor takes below parameters

    @:param: config: dict object with config parameters containing
             url for schema registry (schema.registry.url).
    """
    def __init__(self, config, schema_registry=None):
        # Optional switch: leave keys raw (e.g. plain string keys) while
        # still Avro-decoding values; defaults to decoding both.
        self._enable_key_decoding = config.pop("enable.key.decoding", True)
        schema_registry_url = config.pop("schema.registry.url", None)

        if schema_registry is None:
            if schema_registry_url is None:
                raise ValueError("Missing parameter: schema.registry.url")
            schema_registry = CachedSchemaRegistryClient(
                url=schema_registry_url)
        elif schema_registry_url is not None:
            raise ValueError(
                "Cannot pass schema_registry along with schema.registry.url config"
            )

        super(AvroConsumer, self).__init__(config)
        self._serializer = MessageSerializer(schema_registry)

    def poll(self, timeout=None):
        """
        This is an overriden method from confluent_kafka.Consumer class.
        This handles message deserialization using avro schema

        @:param timeout
        @:return message object with deserialized key and value as dict objects
        """
        if timeout is None:
            timeout = -1
        message = super(AvroConsumer, self).poll(timeout)
        if message is None:
            return None
        if not message.value() and not message.key():
            # Nothing to decode; return the raw message.
            return message
        if not message.error():
            if message.value() is not None:
                decoded_value = self._serializer.decode_message(
                    message.value())
                message.set_value(decoded_value)
            if self._enable_key_decoding and message.key() is not None:
                decoded_key = self._serializer.decode_message(message.key())
                message.set_key(decoded_key)
        return message
class SimpleAvroDeserializer(Deserializer):
    """Deserializer that decodes Confluent-framed Avro payloads through the
    schema registry, honoring the key/value field of the context."""

    def __init__(self, schema_registry_url):
        registry_client = CachedSchemaRegistryClient(
            {'url': schema_registry_url})
        self._serializer = MessageSerializer(registry_client, None, None)

    def __call__(self, value, ctx=None):
        # None payloads (e.g. tombstones) pass through undecoded.
        if value is None:
            return None
        is_key = ctx is not None and ctx.field == 'key'
        return self._serializer.decode_message(value, is_key=is_key)
def printAndProduceMessages(self):
    """Consume Avro records in batches, flatten every item of the nested
    array into its own message, and re-produce the results."""
    consumer = AvroConsumer({
        'bootstrap.servers': BOOTSTRAP_SERVERS,
        'group.id': GROUP_ID,
        'auto.offset.reset': AUTO_OFFSET_RESET,
        # Offsets are committed manually after a successful produce batch.
        'enable.auto.commit': False,
        'schema.registry.url': SCHEMA_REGISTRY_URL
    })
    schema_registry = CachedSchemaRegistryClient(
        os.environ.get('SCHEMA_REGISTRY', SCHEMA_REGISTRY_URL))
    avro_serde = AvroSerde(schema_registry)
    consumer.subscribe([INPUT_TOPIC_NAME])
    while True:
        try:
            consumedMessages = consumer.consume(
                num_messages=CONSUMER_BATCH_SIZE, timeout=1)
        except Exception as e:
            logging.error("Message pool failed: {}".format(e))
            break
        messages = []
        for consumedMessage in consumedMessages:
            consumedMessageValue = avro_serde.decode_message(
                consumedMessage.value())
            message = {}
            message["key"] = {}
            message["value"] = {}
            # Copy all scalar fields (prefixed) except the nested array.
            for attr, value in consumedMessageValue.items():
                if attr != ARRAY_NAME:
                    message["value"][DOCUMENT_FIELD_PREFIX + attr] = value
            # One output message per array item, keyed parent-id "-" item-id.
            for arrayItem in consumedMessageValue[ARRAY_NAME]:
                message["key"]["id"] = consumedMessageValue["id"] + \
                    "-" + arrayItem["id"]
                for attr, value in arrayItem.items():
                    message["value"][attr] = value
                # NOTE(review): the same `message` dict object is appended on
                # every iteration, so later array items overwrite earlier ones
                # in all appended entries — confirm this is intended.
                messages.append(message)
        self.produceMessages(messages)
        consumer.commit()
    consumer.close()
class _AvroIORegistry:
    """Private implementation class for Avro IO using the schema registry."""

    def __init__(self, schema_registry_url):
        """Connect to the registry and cache the client, schema and serializer.

        :raises ValueError: if the registry client or schema lookup fails.
        """
        log.info(
            f"Using registry with schema_url/id {schema_registry_url}/{config.SCHEMA_ID}"
        )
        try:
            self.client = CachedSchemaRegistryClient(url=schema_registry_url)
            self.schema = self.client.get_by_id(config.SCHEMA_ID)
            self.serializer = MessageSerializer(self.client)
        except Exception as exc:
            # BUG FIX: a bare `except:` also swallowed KeyboardInterrupt /
            # SystemExit and discarded the root cause; narrow and chain it.
            raise ValueError("Client id or schema id not found") from exc

    def decode(self, bytes):
        # NOTE: parameter name shadows the builtin `bytes`; kept as-is for
        # interface compatibility with existing callers.
        return self.serializer.decode_message(bytes)

    def encode(self, record):
        return self.serializer.encode_record_with_schema_id(
            config.SCHEMA_ID, record)
def __init__(self, config, schema_registry=None, reader_key_schema=None, reader_value_schema=None):
    """
    Split schema-registry settings from consumer settings, build (or accept)
    the registry client, and create the serializer with optional Avro
    reader schemas for key and value.

    :raises ValueError: if a schema_registry client is passed together with
        a ``schema.registry.url`` entry in config.
    """
    # Registry settings, with the "schema.registry." prefix stripped.
    sr_conf = {key.replace("schema.registry.", ""): value
               for key, value in config.items() if key.startswith("schema.registry")}

    if sr_conf.get("basic.auth.credentials.source") == 'SASL_INHERIT':
        # Fallback to plural 'mechanisms' for backward compatibility
        sr_conf['sasl.mechanism'] = config.get('sasl.mechanism', config.get('sasl.mechanisms', ''))
        sr_conf['sasl.username'] = config.get('sasl.username', '')
        sr_conf['sasl.password'] = config.get('sasl.password', '')

    # Everything that is not a registry key goes to the Kafka consumer.
    ap_conf = {key: value for key, value in config.items()
               if not key.startswith("schema.registry")}

    if schema_registry is None:
        schema_registry = CachedSchemaRegistryClient(sr_conf)
    elif sr_conf.get("url", None) is not None:
        raise ValueError("Cannot pass schema_registry along with schema.registry.url config")

    super(AvroConsumer, self).__init__(ap_conf)
    self._serializer = MessageSerializer(schema_registry, reader_key_schema, reader_value_schema)
def test_select(started_cluster):
    # type: (ClickHouseCluster) -> None
    """End-to-end check that ClickHouse ingests AvroConfluent-framed data
    by resolving schemas from the registry."""
    schema_registry_client = CachedSchemaRegistryClient(
        'http://localhost:{}'.format(started_cluster.schema_registry_port))
    serializer = MessageSerializer(schema_registry_client)

    schema = avro.schema.make_avsc_object({
        'name': 'test_record',
        'type': 'record',
        'fields': [{
            'name': 'value',
            'type': 'long'
        }]
    })

    # Each record is framed in the Confluent wire format by the serializer;
    # three records are concatenated into a single insert payload.
    buf = io.BytesIO()
    for x in range(0, 3):
        message = serializer.encode_record_with_schema('test_subject', schema,
                                                       {'value': x})
        buf.write(message)
    data = buf.getvalue()

    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    schema_registry_url = "http://{}:{}".format(
        started_cluster.schema_registry_host, 8081)

    run_query(instance, "create table avro_data(value Int64) engine = Memory()")
    settings = {'format_avro_schema_registry_url': schema_registry_url}
    run_query(instance, "insert into avro_data format AvroConfluent", data,
              settings)
    stdout = run_query(instance, "select * from avro_data")
    assert list(map(str.split, stdout.splitlines())) == [
        ["0"],
        ["1"],
        ["2"],
    ]
class AvroConsumer(Consumer):
    """
    Kafka Consumer client which does avro schema decoding of messages.
    Handles message deserialization.

    Constructor takes below parameters

    @:param: config: dict object with config parameters containing
             url for schema registry (schema.registry.url).
    """

    def __init__(self, config):
        if 'schema.registry.url' not in config.keys():
            raise ValueError("Missing parameter: schema.registry.url")
        registry_url = config.pop("schema.registry.url")
        super(AvroConsumer, self).__init__(config)
        registry_client = CachedSchemaRegistryClient(url=registry_url)
        self._serializer = MessageSerializer(registry_client)

    def poll(self, timeout):
        """
        This is an overriden method from confluent_kafka.Consumer class.
        This handles message deserialization using avro schema

        @:param timeout
        @:return message object with deserialized key and value as dict objects
        """
        message = super(AvroConsumer, self).poll(timeout)
        if not message:
            return message
        if message.error():
            # Error messages carry no Avro payload; hand them back untouched.
            return message
        raw_value = message.value()
        if raw_value is not None:
            message.set_value(self._serializer.decode_message(raw_value))
        raw_key = message.key()
        if raw_key is not None:
            message.set_key(self._serializer.decode_message(raw_key))
        return message
def __init__(self, config, default_key_schema=None,
             default_value_schema=None, schema_registry=None):
    """
    Build the producer, resolving the schema-registry client and the
    key/value message serializers.

    :param dict config: producer config; may contain ``schema.registry.url``
        plus optional ``key.serializer`` / ``value.serializer`` overrides
    :param default_key_schema: optional default Avro schema for keys
    :param default_value_schema: optional default Avro schema for values
    :param schema_registry: pre-built registry client (mutually exclusive
        with ``schema.registry.url`` in config)
    :raises ValueError: if neither or both registry sources are given
    """
    schema_registry_url = config.pop("schema.registry.url", None)

    if schema_registry is None:
        if schema_registry_url is None:
            raise ValueError("Missing parameter: schema.registry.url")
        schema_registry = CachedSchemaRegistryClient(
            url=schema_registry_url)
    elif schema_registry_url is not None:
        raise ValueError(
            "Cannot pass schema_registry along with schema.registry.url config"
        )

    super(AvroProducer, self).__init__(config)
    self._key_serializer = config.pop("key.serializer",
                                      MessageSerializer(schema_registry))
    # BUG FIX: this previously popped "key.serializer" a second time (which
    # always returned the default), so a configured "value.serializer" was
    # silently ignored.
    self._value_serializer = config.pop("value.serializer",
                                        MessageSerializer(schema_registry))
    self._key_schema = default_key_schema
    self._value_schema = default_value_schema
def __init__(self, config, schema_registry=None,
             reader_key_schema=None, reader_value_schema=None):
    """
    Partition ``config`` into schema-registry and consumer settings, then
    build the registry client (unless injected) and the serializer with the
    optional Avro reader schemas.
    """
    registry_settings = {}
    consumer_settings = {}
    for key, value in config.items():
        if key.startswith("schema.registry"):
            registry_settings[key.replace("schema.registry.", "")] = value
        else:
            consumer_settings[key] = value

    if registry_settings.get("basic.auth.credentials.source") == 'SASL_INHERIT':
        # Inherit SASL credentials from the consumer configuration.
        for field in ('sasl.mechanisms', 'sasl.username', 'sasl.password'):
            registry_settings[field] = config.get(field, '')

    if schema_registry is None:
        schema_registry = CachedSchemaRegistryClient(registry_settings)
    elif registry_settings.get("url", None) is not None:
        raise ValueError("Cannot pass schema_registry along with schema.registry.url config")

    super(AvroConsumer, self).__init__(consumer_settings)
    self._serializer = MessageSerializer(schema_registry, reader_key_schema,
                                         reader_value_schema)
class AvroProducer(Producer):
    """
    Kafka Producer client which does avro schema encoding to messages.
    Handles schema registration, Message serialization.

    Constructor takes below parameters.

    :param dict config: Config parameters containing url for schema registry
                        (``schema.registry.url``) and the standard Kafka client
                        configuration (``bootstrap.servers`` et.al).
    :param str default_key_schema: Optional default avro schema for key
    :param str default_value_schema: Optional default avro schema for value
    """

    def __init__(self, config, default_key_schema=None,
                 default_value_schema=None, schema_registry=None):
        # Registry (and its TLS) settings are popped so the remaining config
        # is valid for the underlying Kafka producer.
        schema_registry_url = config.pop("schema.registry.url", None)
        schema_registry_ca_location = config.pop(
            "schema.registry.ssl.ca.location", None)
        schema_registry_certificate_location = config.pop(
            "schema.registry.ssl.certificate.location", None)
        schema_registry_key_location = config.pop(
            "schema.registry.ssl.key.location", None)

        if schema_registry is None:
            if schema_registry_url is None:
                raise ValueError("Missing parameter: schema.registry.url")
            schema_registry = CachedSchemaRegistryClient(
                url=schema_registry_url,
                ca_location=schema_registry_ca_location,
                cert_location=schema_registry_certificate_location,
                key_location=schema_registry_key_location)
        elif schema_registry_url is not None:
            raise ValueError(
                "Cannot pass schema_registry along with schema.registry.url config"
            )

        super(AvroProducer, self).__init__(config)
        self._serializer = MessageSerializer(schema_registry)
        self._key_schema = default_key_schema
        self._value_schema = default_value_schema

    def produce(self, **kwargs):
        """
        Asynchronously sends message to Kafka by encoding with specified or default avro schema.

        :param str topic: topic name
        :param object value: An object to serialize
        :param str value_schema: Avro schema for value
        :param object key: An object to serialize
        :param str key_schema: Avro schema for key

        Plus any other parameters accepted by confluent_kafka.Producer.produce

        :raises SerializerError: On serialization failure
        :raises BufferError: If producer queue is full.
        :raises KafkaException: For other produce failures.
        """
        # get schemas from kwargs if defined
        key_schema = kwargs.pop('key_schema', self._key_schema)
        value_schema = kwargs.pop('value_schema', self._value_schema)
        topic = kwargs.pop('topic', None)
        if not topic:
            raise ClientError("Topic name not specified.")
        value = kwargs.pop('value', None)
        key = kwargs.pop('key', None)

        if value is not None:
            if value_schema:
                value = self._serializer.encode_record_with_schema(
                    topic, value_schema, value)
            else:
                raise ValueSerializerError("Avro schema required for values")

        if key is not None:
            if key_schema:
                key = self._serializer.encode_record_with_schema(
                    topic, key_schema, key, True)
            else:
                raise KeySerializerError("Avro schema required for key")

        super(AvroProducer, self).produce(topic, value, key, **kwargs)
def setUp(self):
    """Build a fresh mock registry client and a serializer backed by it."""
    registry = MockSchemaRegistryClient()
    self.client = registry
    self.ms = MessageSerializer(registry)
class TestMessageSerializer(unittest.TestCase):
    """Round-trip tests for MessageSerializer against a mock schema registry.

    NOTE(review): the mock registry is stateful; the schema ids asserted below
    come from the order in which schemas are registered within each test.
    """

    def setUp(self):
        # need to set up the serializer
        self.client = MockSchemaRegistryClient()
        self.ms = MessageSerializer(self.client)

    def assertMessageIsSame(self, message, expected, schema_id):
        """Check wire format (magic byte + schema id header) and payload."""
        self.assertTrue(message)
        # Encoded frame is 1 magic byte + 4-byte schema id + avro payload.
        self.assertTrue(len(message) > 5)
        magic, sid = struct.unpack('>bI', message[0:5])
        self.assertEqual(magic, 0)
        self.assertEqual(sid, schema_id)
        decoded = self.ms.decode_message(message)
        self.assertTrue(decoded)
        self.assertEqual(decoded, expected)

    def test_encode_with_schema_id(self):
        """Encoding by explicit schema id round-trips for two distinct schemas."""
        adv = avro.loads(data_gen.ADVANCED_SCHEMA)
        basic = avro.loads(data_gen.BASIC_SCHEMA)
        subject = 'test'
        schema_id = self.client.register(subject, basic)

        records = data_gen.BASIC_ITEMS
        for record in records:
            message = self.ms.encode_record_with_schema_id(schema_id, record)
            self.assertMessageIsSame(message, record, schema_id)

        subject = 'test_adv'
        adv_schema_id = self.client.register(subject, adv)
        # A second registration must yield a different id than the first.
        self.assertNotEqual(adv_schema_id, schema_id)
        records = data_gen.ADVANCED_ITEMS
        for record in records:
            message = self.ms.encode_record_with_schema_id(
                adv_schema_id, record)
            self.assertMessageIsSame(message, record, adv_schema_id)

    def test_encode_record_with_schema(self):
        """Encoding by (topic, schema) registers under '<topic>-value'."""
        topic = 'test'
        basic = avro.loads(data_gen.BASIC_SCHEMA)
        subject = 'test-value'
        schema_id = self.client.register(subject, basic)
        records = data_gen.BASIC_ITEMS
        for record in records:
            message = self.ms.encode_record_with_schema(topic, basic, record)
            self.assertMessageIsSame(message, record, schema_id)

    def test_decode_none(self):
        """null/None messages should decode to None."""
        self.assertIsNone(self.ms.decode_message(None))

    def test_decode_with_schema(self):
        """Schema-evolution: v1/v2 reader schemas decode either encoding."""
        topic = 'test_specific'
        schema_v1 = avro.loads(
            data_gen.load_schema_file('evolution_schema_v1.avsc'))
        schema_v2 = avro.loads(
            data_gen.load_schema_file('evolution_schema_v2.avsc'))
        # One deserializer pinned to each reader schema version.
        dsv1 = SpecificRecordMessageDeserializer(self.client,
                                                 value_schema=schema_v1)
        dsv2 = SpecificRecordMessageDeserializer(self.client,
                                                 value_schema=schema_v2)

        record_v1 = {"name": "suzyq", "age": 27}
        record_v2 = dict(record_v1)
        # presumably 'gender' has a default of 'NONE' in the v2 schema,
        # so a v1 record read with the v2 reader gains it — TODO confirm
        # against evolution_schema_v2.avsc.
        record_v2['gender'] = 'NONE'

        encoded_v1 = self.ms.encode_record_with_schema(topic, schema_v1,
                                                       record_v1)
        decoded_v1_v1 = dsv1.decode_message(encoded_v1, is_key=False)
        self.assertDictEqual(record_v1, decoded_v1_v1)
        decoded_v1_v2 = dsv2.decode_message(encoded_v1, is_key=False)
        self.assertDictEqual(record_v2, decoded_v1_v2)

        encoded_v2 = self.ms.encode_record_with_schema(topic, schema_v2,
                                                       record_v2)
        decoded_v2_v2 = dsv2.decode_message(encoded_v2, is_key=False)
        self.assertDictEqual(record_v2, decoded_v2_v2)
        # Reading a v2 record with the v1 reader drops the extra field.
        decoded_v2_v1 = dsv1.decode_message(encoded_v2, is_key=False)
        self.assertDictEqual(record_v1, decoded_v2_v1)

    def hash_func(self):
        # Used to monkey-patch __hash__ onto avro schema classes so they can
        # be used as dict/set keys by the registry mock.
        return hash(str(self))
def setUp(self):
    # Wire the serializer under test to an in-memory mock registry.
    mock_client = MockSchemaRegistryClient()
    serializer = MessageSerializer(mock_client)
    self.client = mock_client
    self.ms = serializer
class AvroConsumer(Consumer):
    """
    Kafka Consumer client which does avro schema decoding of messages.
    Handles message deserialization.

    Constructor takes below parameters

    :param dict config: Config parameters containing url for schema registry
                        (``schema.registry.url``) and the standard Kafka client
                        configuration (``bootstrap.servers`` et.al)
    :param schema reader_key_schema: a reader schema for the message key
    :param schema reader_value_schema: a reader schema for the message value
    :raises ValueError: For invalid configurations
    """

    def __init__(self, config, schema_registry=None,
                 reader_key_schema=None, reader_value_schema=None):
        # Split config: "schema.registry.*" keys (prefix stripped) feed the
        # registry client; everything else is plain Kafka consumer config.
        sr_conf = {key.replace("schema.registry.", ""): value
                   for key, value in config.items() if key.startswith("schema.registry")}

        if sr_conf.get("basic.auth.credentials.source") == 'SASL_INHERIT':
            # Registry auth inherits the Kafka client's SASL credentials.
            sr_conf['sasl.mechanisms'] = config.get('sasl.mechanisms', '')
            sr_conf['sasl.username'] = config.get('sasl.username', '')
            sr_conf['sasl.password'] = config.get('sasl.password', '')

        ap_conf = {key: value
                   for key, value in config.items() if not key.startswith("schema.registry")}

        if schema_registry is None:
            schema_registry = CachedSchemaRegistryClient(sr_conf)
        elif sr_conf.get("url", None) is not None:
            # Supplying both an explicit client and a URL is ambiguous.
            raise ValueError("Cannot pass schema_registry along with schema.registry.url config")

        super(AvroConsumer, self).__init__(ap_conf)
        self._serializer = MessageSerializer(schema_registry, reader_key_schema, reader_value_schema)

    def poll(self, timeout=None):
        """
        This is an overridden method from confluent_kafka.Consumer class.
        This handles message deserialization using avro schema

        :param float timeout: Poll timeout in seconds (default: indefinite)
        :returns: message object with deserialized key and value as dict objects
        :rtype: Message
        """
        if timeout is None:
            # -1 requests an indefinite wait from the underlying consumer.
            timeout = -1
        message = super(AvroConsumer, self).poll(timeout)
        if message is None:
            return None
        # Error messages (e.g. partition EOF) are returned undecoded.
        if not message.error():
            try:
                if message.value() is not None:
                    decoded_value = self._serializer.decode_message(message.value(), is_key=False)
                    message.set_value(decoded_value)
                if message.key() is not None:
                    decoded_key = self._serializer.decode_message(message.key(), is_key=True)
                    message.set_key(decoded_key)
            except SerializerError as e:
                # Re-raise with topic/partition/offset context for debugging.
                raise SerializerError("Message deserialization failed for message at {} [{}] offset {}: {}".format(
                    message.topic(), message.partition(), message.offset(), e))
        return message
class AvroProducer(Producer):
    """
    Kafka Producer client which does avro schema encoding to messages.
    Handles schema registration, Message serialization.

    Constructor takes below parameters.

    :param dict config: Config parameters containing url for schema registry
                        (``schema.registry.url``) and the standard Kafka client
                        configuration (``bootstrap.servers`` et.al).
    :param str default_key_schema: Optional default avro schema for key
    :param str default_value_schema: Optional default avro schema for value
    """

    def __init__(self, config, default_key_schema=None,
                 default_value_schema=None, schema_registry=None):
        # Partition the config in a single pass: "schema.registry.*" entries
        # (prefix stripped) configure the registry client, the rest configure
        # the underlying Kafka producer.
        registry_conf = {}
        producer_conf = {}
        for conf_key, conf_value in config.items():
            if conf_key.startswith("schema.registry"):
                registry_conf[conf_key.replace("schema.registry.", "")] = conf_value
            else:
                producer_conf[conf_key] = conf_value

        if registry_conf.get("basic.auth.credentials.source") == 'SASL_INHERIT':
            # Registry auth inherits the Kafka client's SASL credentials.
            for sasl_key in ('sasl.mechanisms', 'sasl.username', 'sasl.password'):
                registry_conf[sasl_key] = config.get(sasl_key, '')

        if schema_registry is None:
            schema_registry = CachedSchemaRegistryClient(registry_conf)
        elif registry_conf.get("url", None) is not None:
            raise ValueError("Cannot pass schema_registry along with schema.registry.url config")

        super(AvroProducer, self).__init__(producer_conf)
        self._serializer = MessageSerializer(schema_registry)
        self._key_schema = default_key_schema
        self._value_schema = default_value_schema

    def produce(self, **kwargs):
        """
        Asynchronously sends message to Kafka by encoding with specified or
        default avro schema.

        :param str topic: topic name
        :param object value: An object to serialize
        :param str value_schema: Avro schema for value
        :param object key: An object to serialize
        :param str key_schema: Avro schema for key

        Plus any other parameters accepted by confluent_kafka.Producer.produce

        :raises SerializerError: On serialization failure
        :raises BufferError: If producer queue is full.
        :raises KafkaException: For other produce failures.
        """
        # Per-call schemas win over the constructor-supplied defaults.
        key_schema = kwargs.pop('key_schema', self._key_schema)
        value_schema = kwargs.pop('value_schema', self._value_schema)
        topic = kwargs.pop('topic', None)
        if not topic:
            raise ClientError("Topic name not specified.")
        value = kwargs.pop('value', None)
        key = kwargs.pop('key', None)

        if value is not None:
            if not value_schema:
                raise ValueSerializerError("Avro schema required for values")
            value = self._serializer.encode_record_with_schema(topic, value_schema, value)

        if key is not None:
            if not key_schema:
                raise KeySerializerError("Avro schema required for key")
            key = self._serializer.encode_record_with_schema(topic, key_schema, key, True)

        super(AvroProducer, self).produce(topic, value, key, **kwargs)
# Example script: consume Confluent-Avro-encoded Kafka records from a Spark
# Streaming job, using MessageSerializer for decoding.
from pyspark.sql import SQLContext, SparkSession
from pyspark.streaming import StreamingContext
from confluent_kafka.avro.cached_schema_registry_client import CachedSchemaRegistryClient
from confluent_kafka.avro.serializer.message_serializer import MessageSerializer
from pyspark.streaming.kafka import KafkaUtils
import json

# Local endpoints for the schema registry and the Kafka broker.
var_schema_url = 'http://localhost:8081'
var_kafka_parms_src = {"metadata.broker.list": 'localhost:9092'}

# Module-level side effects: these open network-backed clients at import time.
schema_registry_client = CachedSchemaRegistryClient(var_schema_url)
serializer = MessageSerializer(schema_registry_client)

spark = SparkSession.builder \
    .appName('Advertiser_stream') \
    .master('local[*]') \
    .getOrCreate()


def handler(message):
    # Per-batch RDD callback: collect() pulls the batch to the driver.
    # NOTE(review): this fragment only prints the key/value types; the
    # decoding step (serializer.decode_message) may be truncated here —
    # confirm against the full script.
    records = message.collect()
    for record in records:
        var_val_key = record[0]
        var_val_value = record[1]
        print(type(var_val_key))
        print(type(var_val_value))
def test_kafka_destination_expression_partitioner_avro(sdc_builder, sdc_executor, cluster, confluent):
    """This test ensures that the correct serializer is set when producing AVRO records and using EXPRESSION
    partition strategy. We do so by setting the confluent serializer in the stage config, and also setting it to
    the kafka consumer used in the test. The consumer won't be able to deserialize the records if they're not
    serialized in AVRO.
    """
    # Random topic name avoids collisions between test runs on a shared cluster.
    topic = get_random_string(string.ascii_letters, 10)
    logger.debug('Kafka topic name: %s', topic)
    data = {'myLongField1': 'My Long Message'}

    # Build the Kafka destination pipeline.
    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    # Single-batch JSON source feeding the record under test.
    source = builder.add_stage('Dev Raw Data Source')
    source.set_attributes(stop_after_first_batch=True, data_format='JSON', raw_data=json.dumps(data))

    destination = builder.add_stage(
        name='com_streamsets_pipeline_stage_destination_kafka_KafkaDTarget',
        library=cluster.kafka.standalone_stage_lib)
    # Set configuration to use AVRO with a registered schema in confluent, and expression partition strategy
    destination.set_attributes(topic=topic,
                               data_format='AVRO',
                               message_key_format='AVRO',
                               avro_schema_location='REGISTRY',
                               lookup_schema_by='SUBJECT',
                               schema_subject=f'{topic}-value',
                               include_schema=False,
                               partition_strategy='EXPRESSION',
                               partition_expression='${0}',
                               kafka_message_key='',
                               key_serializer='CONFLUENT',
                               value_serializer='CONFLUENT')

    source >> destination
    pipeline = builder\
        .build(title='Kafka Destination pipeline with Expression Partitioner')\
        .configure_for_environment(cluster, confluent)
    sdc_executor.add_pipeline(pipeline)

    # Create the avro schema and register it to confluent
    field = avro.schema.Field(type=avro.schema.PrimitiveSchema(
        avro.schema.STRING), name='myLongField1', index=0, has_default=False)
    schema = avro.schema.RecordSchema(name=f'value_{topic}', namespace=None,
                                      fields=[field], names=avro.schema.Names())
    # Subject '<topic>-value' matches the stage's schema_subject lookup above.
    confluent.schema_registry.register(f'{topic}-value', schema)

    # Set the confluent serializer to the kafka consumer
    serializer = MessageSerializer(confluent.schema_registry)
    consumer = cluster.kafka.consumer(
        consumer_timeout_ms=1000,
        auto_offset_reset='earliest',
        key_deserializer=partial(serializer.decode_message, is_key=True),
        value_deserializer=partial(serializer.decode_message, is_key=False))
    consumer.subscribe([topic])

    sdc_executor.start_pipeline(pipeline).wait_for_finished()

    # One input record through a stop-after-first-batch source => one message,
    # which must decode back to the original dict (proving AVRO serialization).
    msgs_received = [message for message in consumer]
    assert 1 == len(msgs_received)
    assert [message.value for message in msgs_received] == [data]