def _get_decoder_func(self, schema_id, payload, is_key=False):
    if schema_id in self.id_to_decoder_func:
        return self.id_to_decoder_func[schema_id]

    # fetch writer schema from schema reg
    try:
        writer_schema_obj = self.registry_client.get_by_id(schema_id)
    except ClientError as e:
        raise SerializerError("unable to fetch schema with id %d: %s" % (schema_id, str(e)))

    if writer_schema_obj is None:
        raise SerializerError("unable to fetch schema with id %d" % (schema_id))

    curr_pos = payload.tell()

    reader_schema_obj = (self.reader_key_schema if is_key else self.reader_value_schema)

    if HAS_FAST:
        # try to use fast avro
        try:
            fast_avro_writer_schema = parse_schema(writer_schema_obj.to_json())
            fast_avro_reader_schema = parse_schema(reader_schema_obj.to_json())
            schemaless_reader(payload, fast_avro_writer_schema)

            # If we reach this point, this means we have fastavro and it can
            # do this deserialization. Rewind since this method just determines
            # the reader function and we need to deserialize again along the
            # normal path.
            payload.seek(curr_pos)

            self.id_to_decoder_func[schema_id] = lambda p: schemaless_reader(
                p, fast_avro_writer_schema, fast_avro_reader_schema)
            return self.id_to_decoder_func[schema_id]
        except Exception:
            # Fast avro failed, fall thru to standard avro below.
            pass

    # here means we should just delegate to slow avro
    # rewind
    payload.seek(curr_pos)

    # Avro DatumReader py2/py3 inconsistency, hence no param keywords
    # should be revisited later
    # https://github.com/apache/avro/blob/master/lang/py3/avro/io.py#L459
    # https://github.com/apache/avro/blob/master/lang/py/src/avro/io.py#L423
    # def __init__(self, writers_schema=None, readers_schema=None)
    # def __init__(self, writer_schema=None, reader_schema=None)
    avro_reader = avro.io.DatumReader(writer_schema_obj, reader_schema_obj)

    def decoder(p):
        bin_decoder = avro.io.BinaryDecoder(p)
        return avro_reader.read(bin_decoder)

    self.id_to_decoder_func[schema_id] = decoder
    return self.id_to_decoder_func[schema_id]
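# Below, a minimal self-contained sketch of the probe-and-cache pattern used in
# _get_decoder_func above, assuming fastavro is installed; the record schema and
# the schema id are hypothetical stand-ins. The probe reads the payload once with
# schemaless_reader, then a lambda closing over the parsed schema is cached so
# later messages with the same schema id skip the registry round trip.
import io

from fastavro import parse_schema, schemaless_reader, schemaless_writer

_user_schema = parse_schema({
    "type": "record", "name": "User",
    "fields": [{"name": "name", "type": "string"}],
})

_decoders = {}  # schema_id -> decoder, mirrors id_to_decoder_func


def _decoder_for(schema_id):
    if schema_id not in _decoders:
        _decoders[schema_id] = lambda p: schemaless_reader(p, _user_schema, _user_schema)
    return _decoders[schema_id]


buf = io.BytesIO()
schemaless_writer(buf, _user_schema, {"name": "alice"})
buf.seek(0)
assert _decoder_for(1)(buf) == {"name": "alice"}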
def produce(self, **kwargs):
    """
    Sends message to Kafka by encoding with the specified Avro schema.

    :param str topic: topic name
    :param dict value: value to serialize
    :param value_schema: Avro schema for value
    :param dict key: key to serialize
    :param key_schema: Avro schema for key
    :raises SerializerError: on serialization failure
    """
    # get schemas from kwargs if defined, falling back to the defaults
    # supplied at construction time
    key_schema = kwargs.pop('key_schema', self._key_schema)
    value_schema = kwargs.pop('value_schema', self._value_schema)
    topic = kwargs.pop('topic', None)
    if not topic:
        raise ClientError("Topic name not specified.")
    value = kwargs.pop('value', None)
    key = kwargs.pop('key', None)

    # test against None rather than truthiness so empty-but-valid records
    # (e.g. {}) are still serialized
    if value is not None:
        if value_schema:
            value = self._serializer.encode_record_with_schema(topic, value_schema, value)
        else:
            raise SerializerError("Avro schema required for value")

    if key is not None:
        if key_schema:
            key = self._serializer.encode_record_with_schema(topic, key_schema, key, True)
        else:
            raise SerializerError("Avro schema required for key")

    super(AvroProducer, self).produce(topic, value, key, **kwargs)
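# A hedged usage sketch for the produce() override above, assuming the enclosing
# class is confluent_kafka.avro.AvroProducer; the broker and schema registry
# addresses are placeholders, not a tested deployment.
from confluent_kafka import avro
from confluent_kafka.avro import AvroProducer

value_schema = avro.loads(
    '{"type": "record", "name": "Value",'
    ' "fields": [{"name": "msg", "type": "string"}]}')

producer = AvroProducer(
    {'bootstrap.servers': 'localhost:9092',
     'schema.registry.url': 'http://localhost:8081'},
    default_value_schema=value_schema)

# topic/value/key and their schemas all travel as kwargs; the schema kwargs
# fall back to the defaults supplied at construction time
producer.produce(topic='test-topic', value={'msg': 'hello'})
producer.flush()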
def _get_decoder_func(self, schema_id, is_key):
    if schema_id in self.id_to_decoder_func:
        return self.id_to_decoder_func[schema_id]

    # fetch from schema reg
    try:
        schema = self.registry_client.get_by_id(schema_id)
    except ClientError as e:
        raise SerializerError("unable to fetch schema with id %d: %s" % (schema_id, str(e)))

    if schema is None:
        raise SerializerError("unable to fetch schema with id %d" % schema_id)

    # the avro package renamed DatumReader's keyword arguments between
    # Python 2 and Python 3, so pick the right spelling at runtime
    if sys.version_info[0] < 3:
        avro_reader = avro.io.DatumReader(
            readers_schema=self.key_schema if is_key else self.value_schema,
            writers_schema=schema)
    else:
        avro_reader = avro.io.DatumReader(
            reader_schema=self.key_schema if is_key else self.value_schema,
            writer_schema=schema)

    def decoder(p):
        bin_decoder = avro.io.BinaryDecoder(p)
        return avro_reader.read(bin_decoder)

    self.id_to_decoder_func[schema_id] = decoder
    return self.id_to_decoder_func[schema_id]
def _load_schema(self, schema_id):
    # fetch from schema reg
    try:
        schema = self.registry_client.get_by_id(schema_id)
    except ClientError as e:
        raise SerializerError("unable to fetch schema with id %d: %s" % (schema_id, str(e)))

    if schema is None:
        raise SerializerError("unable to fetch schema with id %d" % (schema_id))

    return schema
def decode(self, data):
    if len(data) <= 5:
        raise SerializerError("Message is too small to decode")

    with ContextStringIO(data) as payload:
        magic, schema_id = struct.unpack(">bI", payload.read(5))
        if magic != MAGIC_BYTE:
            raise SerializerError("message does not start with magic byte")

        # schema_id is unpacked only to advance past the 5-byte header;
        # this decoder always reads with the fixed self.schema
        avro_reader = avro.io.DatumReader(self.schema)

        def decoder(p):
            bin_decoder = avro.io.BinaryDecoder(p)
            return avro_reader.read(bin_decoder)

        return decoder(payload)
def decode_message(self, message):
    """
    Decode a message from Kafka that has been encoded for use with
    the schema registry.

    :param message: message to be decoded
    """
    if len(message) <= 5:
        raise SerializerError("message is too small to decode")

    with ContextStringIO(message) as payload:
        magic, schema_id = struct.unpack('>bI', payload.read(5))
        if magic != MAGIC_BYTE:
            raise SerializerError("message does not start with magic byte")
        decoder_func = self._get_decoder_func(schema_id, payload)
        return decoder_func(payload)
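# For reference, a sketch of the Confluent wire format that decode_message()
# unpacks: one magic byte (0) followed by a 4-byte big-endian schema id, then
# the Avro binary body. The schema id and body bytes here are hypothetical.
import struct

MAGIC_BYTE = 0
schema_id = 42
avro_body = b'\x0ahello'  # stand-in for an Avro-encoded record

framed = struct.pack('>bI', MAGIC_BYTE, schema_id) + avro_body
magic, parsed_id = struct.unpack('>bI', framed[:5])
assert magic == MAGIC_BYTE and parsed_id == schema_id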
def poll(self, timeout=None):
    """
    This is an overridden method from the confluent_kafka.Consumer class.
    It handles message deserialization using the Avro schema.

    :param float timeout: Poll timeout in seconds (default: indefinite)
    :returns: message object with deserialized key and value as dict objects
    :rtype: Message
    """
    if timeout is None:
        timeout = -1
    message = super(AvroConsumer, self).poll(timeout)
    if message is None:
        return None

    if not message.error():
        try:
            if message.value() is not None:
                decoded_value = self._serializer.decode_message(message.value(), is_key=False)
                message.set_value(decoded_value)
            if message.key() is not None:
                decoded_key = self._serializer.decode_message(message.key(), is_key=True)
                message.set_key(decoded_key)
        except SerializerError as e:
            raise SerializerError(
                "Message deserialization failed for message at {} [{}] offset {}: {}".format(
                    message.topic(), message.partition(), message.offset(), e))
    return message
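# A hedged usage sketch for the poll() override above, assuming the enclosing
# class is confluent_kafka.avro.AvroConsumer; addresses and topic name are
# placeholders.
from confluent_kafka.avro import AvroConsumer

consumer = AvroConsumer({
    'bootstrap.servers': 'localhost:9092',
    'schema.registry.url': 'http://localhost:8081',
    'group.id': 'example-group',
})
consumer.subscribe(['test-topic'])

msg = consumer.poll(timeout=1.0)
if msg is not None and not msg.error():
    # key() and value() are already dicts thanks to the override above
    print(msg.key(), msg.value())
consumer.close()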
def _get_decoder_func(self, schema_id, payload):
    if schema_id in self.id_to_decoder_func:
        return self.id_to_decoder_func[schema_id]

    # fetch from schema reg
    try:
        schema = self.registry_client.get_by_id(schema_id)
    except ClientError as e:
        raise SerializerError("unable to fetch schema with id %d: %s" % (schema_id, str(e)))

    if schema is None:
        raise SerializerError("unable to fetch schema with id %d" % (schema_id))

    curr_pos = payload.tell()

    if HAS_FAST:
        # try to use fast avro
        try:
            schema_dict = schema.to_json()
            read_data(payload, schema_dict)

            # If we reach this point, this means we have fastavro and it can
            # do this deserialization. Rewind since this method just determines
            # the reader function and we need to deserialize again along the
            # normal path.
            payload.seek(curr_pos)

            self.id_to_decoder_func[schema_id] = lambda p: read_data(p, schema_dict)
            return self.id_to_decoder_func[schema_id]
        except Exception:
            # Fast avro failed, fall thru to standard avro below.
            pass

    # here means we should just delegate to slow avro
    # rewind
    payload.seek(curr_pos)
    avro_reader = avro.io.DatumReader(schema)

    def decoder(p):
        bin_decoder = avro.io.BinaryDecoder(p)
        return avro_reader.read(bin_decoder)

    self.id_to_decoder_func[schema_id] = decoder
    return self.id_to_decoder_func[schema_id]
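# The rewind trick above in isolation: probe the stream once to check that a
# reader can handle it, then seek back so the cached decoder re-reads the very
# same bytes. io.BytesIO stands in for the message payload.
import io

payload = io.BytesIO(b'\x02hi')  # hypothetical Avro body
curr_pos = payload.tell()
payload.read()          # the probe consumes the stream...
payload.seek(curr_pos)  # ...so rewind before handing it to the real decoder
assert payload.read(1) == b'\x02'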
def produce(self, **kwargs):
    '''
    Sends message to Kafka by encoding with the specified Avro schema.

    :param str topic: topic name
    :param dict value: value to serialize
    :param value_schema: Avro schema for value
    :param dict key: key to serialize
    :param key_schema: Avro schema for key
    :raises SerializerError: on serialization failure
    '''
    # get schemas from kwargs if defined
    key_schema = kwargs.pop('key_schema', None)
    value_schema = kwargs.pop('value_schema', None)
    topic = kwargs.pop('topic', None)
    if not topic:
        log.error("Topic name not specified.")
        raise ClientError("Topic name not specified.")
    value = kwargs.pop('value', None)
    key = kwargs.pop('key', None)

    # if key_schema was not given, fall back on the default key_schema
    # passed as a construction parameter
    if not key_schema:
        key_schema = self.key_schema

    # likewise for value_schema
    if not value_schema:
        value_schema = self.value_schema

    if value is not None:
        if value_schema:
            value = self._serializer.encode_record_with_schema(topic, value_schema, value)
        else:
            log.error("Schema required for value serialization")
            raise SerializerError("Avro schema required for value")

    if key is not None:
        if key_schema:
            key = self._serializer.encode_record_with_schema(topic, key_schema, key, True)
        else:
            log.error("Schema required for key serialization")
            raise SerializerError("Avro schema required for key")

    self._producer.produce(topic, value, key, **kwargs)
def decode_message(self, message, is_key=False):
    """
    Decode a message from Kafka that has been encoded for use with
    the schema registry.

    :param str|bytes or None message: message key or value to be decoded
    :returns: Decoded message contents.
    :rtype: dict
    """
    if message is None:
        return None

    if len(message) <= 5:
        raise SerializerError("message is too small to decode")

    with ContextStringIO(message) as payload:
        magic, schema_id = struct.unpack('>bI', payload.read(5))
        if magic != MAGIC_BYTE:
            raise SerializerError("message does not start with magic byte")
        decoder_func = self._get_decoder_func(schema_id, payload, is_key)
        return decoder_func(payload)
def deserialize(self, message):
    # Apply any decoding, like from an AvroConsumer
    if hasattr(self, '_serializer') and self._serializer:
        try:
            if message.value() is not None:
                decoded_value = self._serializer.decode_message(message.value(), is_key=False)
                message.set_value(decoded_value)
            if message.key() is not None:
                decoded_key = self._serializer.decode_message(message.key(), is_key=True)
                message.set_key(decoded_key)
        except SerializerError as e:
            raise SerializerError(
                "Message deserialization failed for message at {} [{}] offset {}: {}".format(
                    message.topic(), message.partition(), message.offset(), e))
    return message
def _get_decoder_func(self, schema_id, payload):
    if schema_id in self.id_to_decoder_func:
        return self.id_to_decoder_func[schema_id]

    # fetch from schema reg; catch only the expected registry failure
    # rather than using a bare except
    try:
        schema = self.registry_client.get_by_id(schema_id)
    except ClientError:
        schema = None

    if not schema:
        raise SerializerError("unable to fetch schema with id %d" % (schema_id))

    curr_pos = payload.tell()

    if HAS_FAST:
        # try to use fast avro
        try:
            schema_dict = schema.to_json()
            read_data(payload, schema_dict)

            # here means we passed, so this is something fastavro can do;
            # seek back since it will be called again for the same
            # payload - a one-time hit
            payload.seek(curr_pos)

            def decoder_func(p):
                return read_data(p, schema_dict)

            self.id_to_decoder_func[schema_id] = decoder_func
            return self.id_to_decoder_func[schema_id]
        except Exception:
            # fastavro cannot handle this schema; fall through to slow avro
            pass

    # here means we should just delegate to slow avro
    # rewind
    payload.seek(curr_pos)
    avro_reader = avro.io.DatumReader(schema)

    def decoder(p):
        bin_decoder = avro.io.BinaryDecoder(p)
        return avro_reader.read(bin_decoder)

    self.id_to_decoder_func[schema_id] = decoder
    return self.id_to_decoder_func[schema_id]
def poll(
    self,
    group_id,
    timeout=1,
    max_records=1,
    poll_attempts=10,
    only_value=True,
    auto_create_topics=True,
    decode_format=None,
    fail_on_deserialization=False
):
    """Fetch messages from assigned topics / partitions and return them as a list.

    - ``timeout`` (int): Seconds spent waiting in poll if data is not
      available in the buffer. Default: `1`.
    - ``max_records`` (int): Maximum number of messages to get from poll.
      If 0, returns immediately with any records that are currently
      available in the buffer, else returns empty. Must not be negative.
      Default: `1`.
    - ``poll_attempts`` (int): Number of attempts to consume messages, as a
      guard against endless looping; sometimes the first messages are None
      or the topic could be empty. Default: `10`.
    - ``only_value`` (bool): Return only message.value(). Default: `True`.
    - ``decode_format`` (str): Decode data to a specific format (see
      https://docs.python.org/3/library/codecs.html#standard-encodings).
      Default: `None`.
    - ``auto_create_topics`` (bool): Consumers no longer trigger auto
      creation of topics; this flag will be removed in a future release.
      If True, the UNKNOWN_TOPIC_OR_PART error is ignored. Default: `True`.
    - ``fail_on_deserialization`` (bool): If True, raise a SerializerError
      when message deserialization fails; if False, stop the current poll
      and return the messages gathered so far. Default: `False`.
    """
    messages = []
    while poll_attempts > 0:
        msg = None
        try:
            msg = self.consumers[group_id].poll(timeout=timeout)
        except SerializerError as err:
            error = 'Message deserialization failed for {}: {}'.format(msg, err)
            if fail_on_deserialization:
                raise SerializerError(error)
            print(error)
            break

        if msg is None:
            poll_attempts -= 1
            continue

        if msg.error():
            # Workaround for the new message return semantics after the
            # deprecation: "Consumers no longer trigger auto creation of topics"
            if int(msg.error().code()) == KafkaError.UNKNOWN_TOPIC_OR_PART and auto_create_topics:
                continue
            raise KafkaException(msg.error())

        if only_value:
            messages.append(msg.value())
        else:
            messages.append(msg)

        if len(messages) == max_records:
            break

    if decode_format:
        messages = self._decode_data(data=messages, decode_format=decode_format)

    return messages
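# A hedged usage sketch for the poll keyword above, assuming it belongs to a
# Robot Framework library object holding a consumers dict keyed by group id
# (in the style of robotframework-confluentkafkalibrary); create_consumer and
# subscribe_topic are assumed helpers, not verified API.
library = ConfluentKafkaLibrary()
group_id = library.create_consumer()
library.subscribe_topic(group_id, 'test-topic')

# fetch up to 5 whole Message objects, tolerating UNKNOWN_TOPIC_OR_PART while
# the topic is created, and failing hard on bad Avro payloads
records = library.poll(group_id, timeout=2, max_records=5,
                       only_value=False, fail_on_deserialization=True)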
def test_message(self):
    try:
        raise SerializerError("message")
    except SerializerError as e:
        # SerializerError stores its text on a .message attribute, which
        # plain Python 3 exceptions no longer provide
        assert e.message == 'message'