class Meta:
    """Avro (de)serialization helper bound to one service's param/result schemas."""

    def __init__(self, callback, service_name, param_schema, result_schema, version=0):
        self.callback = callback
        self.service_name = service_name
        self.param_schema = SchemaFromJSONData(param_schema, Names())
        self.result_schema = SchemaFromJSONData(result_schema, Names())
        self.version = version
        # Writers/readers are schema-bound and reusable across calls.
        self._param_writer = DatumWriter(self.param_schema)
        self._param_reader = DatumReader(self.param_schema)
        self._result_writer = DatumWriter(self.result_schema)
        self._result_reader = DatumReader(self.result_schema)

    def decode_param(self, byte_mem):
        """Decode Avro-encoded parameter bytes into a Python value."""
        return self._param_reader.read(BinaryDecoder(BytesIO(byte_mem)))

    def encode_param(self, param):
        """Encode a parameter value to Avro binary bytes."""
        logger.info(param)
        buf = BytesIO()  # renamed from `io`, which shadowed the stdlib module
        self._param_writer.write(param, BinaryEncoder(buf))
        return buf.getbuffer().tobytes()

    def decode_result(self, byte_mem):
        """Decode Avro-encoded result bytes into a Python value."""
        return self._result_reader.read(BinaryDecoder(BytesIO(byte_mem)))

    def encode_result(self, result):
        """Encode a result value to Avro binary bytes."""
        buf = BytesIO()
        self._result_writer.write(result, BinaryEncoder(buf))
        return buf.getbuffer().tobytes()
def output(self, dataframes: [DataFrame]):
    """
    Emits the transformed data from the filter to a log.

    :param dataframes: The dataframes to log
    """
    # Reader is built with the same schema as both writer and reader schema.
    syslog_reader = DatumReader(SYSLOG_AVRO_SCHEMA, SYSLOG_AVRO_SCHEMA)
    for dataframe in dataframes:
        for row in dataframe.collect():
            buffer = BytesIO()
            buffer.write(row.value)
            buffer.flush()
            try:
                buffer.seek(0)
                decoder = BinaryDecoder(buffer)
                output = syslog_reader.read(decoder)
                self.log(INFO, f"Received AVRO syslog -> { output }")
            except BaseException as e:
                # NOTE(review): this fallback rewinds and re-reads with the SAME
                # reader/schema as above, so if the first read failed this one
                # fails identically — presumably a different (feedback) schema
                # was intended here; confirm against the producer.
                # NOTE(review): catching BaseException also traps
                # KeyboardInterrupt/SystemExit; consider narrowing to Exception.
                try:
                    buffer.seek(0)
                    decoder = BinaryDecoder(buffer)
                    output = syslog_reader.read(decoder)
                    self.log(INFO, f"ERROR Received AVRO feedback -> {output}")
                except BaseException as e:
                    self.log(ERROR, f"{e}")
def test_sanity():
    """
    Ensures that our "base" and "good" schemas are actually forwards- and
    backwards-compatible.
    """
    # Parse both schemas and build one sample record for each.
    base_schema = schema.parse(open("%s/MyRecord.base.avsc" % BASE_DIR).read())
    base_writer = DatumWriter(writers_schema=base_schema)
    base_record = {
        "fieldWithoutDefaultValue": 0,
        "properField": 0,
        "enumField": "A",
        "unionField": None,
        "arrayField": ["world"],
        "mapField": {"hello": "world"},
        "fixedField": "aaaaaaaaaaaaaaaa"
    }

    good_schema = schema.parse(open("%s/MyRecord.good.avsc" % BASE_DIR).read())
    good_writer = DatumWriter(writers_schema=good_schema)
    good_record = {
        "fieldWithoutDefaultValue": 0,
        "properField2": 0,
        "enumField": "B",
        "unionField": None,
        "arrayField": ["world"],
        "fixedField": "bbbbbbbbbbbbbbbb"
    }

    def encode(writer, record):
        # Serialize one record to an Avro binary string.
        buf = StringIO.StringIO()
        writer.write(record, BinaryEncoder(buf))
        return buf.getvalue()

    base_data = encode(base_writer, base_record)
    good_data = encode(good_writer, good_record)

    # Read base-written data with the good schema: no exception -> compatible.
    reader = DatumReader(writers_schema=base_schema, readers_schema=good_schema)
    reader.read(BinaryDecoder(StringIO.StringIO(base_data)))

    # Read good-written data with the base schema: no exception -> compatible.
    reader = DatumReader(writers_schema=good_schema, readers_schema=base_schema)
    reader.read(BinaryDecoder(StringIO.StringIO(good_data)))
def verify_kafka_avro_messages(record, key_capture_mode, key_capture_field, key_capture_attribute, key_schema, message_keys, message_values):
    """Assert that a consumed record carries the expected Avro values and keys."""
    for field, expected in message_values.items():
        assert record.get_field_data(f'/{field}').value == expected

    # Validate message key fields/attribute based on configuration.
    if key_capture_mode in ('RECORD_FIELD', 'RECORD_HEADER_AND_FIELD'):
        # The message key should have been captured into the configured field.
        for field, expected in message_keys.items():
            assert record.get_field_data(f"{key_capture_field}/{field}").value == expected

    if key_capture_mode in ('RECORD_HEADER', 'RECORD_HEADER_AND_FIELD'):
        # The header attribute holds the base64-encoded Avro message key.
        encoded = record.header['values'][key_capture_attribute]
        key_bytes = base64.standard_b64decode(encoded)
        decoder = BinaryDecoder(io.BytesIO(key_bytes))
        # The key schema itself travels in a record header value;
        # it must match the schema we actually produced with, earlier.
        decoded_key_schema = confluent_kafka.avro.loads(
            record.header['values']['avroKeySchema'])
        assert decoded_key_schema == key_schema
        decoded_avro_key = DatumReader(decoded_key_schema).read(decoder)
        for field, expected in message_keys.items():
            assert decoded_avro_key[field] == expected
class Deserializer(object):
    """Decodes Avro-encoded byte strings using a schema fixed at construction."""

    def __init__(self, schema_str):
        self.reader = DatumReader(parse(schema_str))

    def deserialize(self, rec_bytes):
        """Return the datum decoded from rec_bytes."""
        decoder = BinaryDecoder(StringIO(rec_bytes))
        return self.reader.read(decoder)
def deserialize(
    self,
    data,    # type: Union[bytes, BinaryIO]
    schema,  # type: Union[str, bytes, avro.schema.Schema]
):
    # type: (...) -> ObjectType
    """Read the binary representation into a specific type.

    Return type will be ignored, since the schema is deduced from the
    provided bytes.

    :param data: A stream of bytes or bytes directly
    :type data: BinaryIO or bytes
    :param schema: An Avro RecordSchema
    :type schema: Union[str, bytes, avro.schema.Schema]
    :returns: An instantiated object
    :rtype: ObjectType
    """
    # Accept raw bytes as well as file-like objects.
    stream = data if hasattr(data, 'read') else BytesIO(data)
    if not isinstance(schema, avro.schema.Schema):
        schema = avro.schema.parse(schema)

    # Reuse one DatumReader per distinct schema string.
    cache_key = str(schema)
    reader = self._schema_reader_cache.get(cache_key)
    if reader is None:
        reader = DatumReader(writers_schema=schema)
        self._schema_reader_cache[cache_key] = reader

    # The stream is consumed and closed here.
    with stream:
        return reader.read(BinaryDecoder(stream))
def consume(self):
    """Yield messages from the Kafka consumer, deserialized per self.ser_type."""
    for message in self.kfkcon:
        try:
            if self.ser_type == kfkcfg.SERIALIZATIO_JSON:
                message = json.loads(message.value.decode('utf-8'))
            elif self.ser_type == kfkcfg.SERIALIZATIO_AVRO:
                reader = DatumReader(self.avro_schema)
                decoder = BinaryDecoder(io.BytesIO(message.value))
                try:
                    message = reader.read(decoder)
                    print(message)
                except Exception as e:
                    # Decode failure: keep the raw message and carry on.
                    print(e)
            if self.print_msg:
                parser_logger.info(
                    'Message to consume: {} -- serialization: {}'.format(
                        message, self.ser_type))
            yield message
        except Exception as e:
            parser_logger.info(
                'unable to parse the msg!: {}...error: {}'.format(
                    message, e))
def _decode_avro(raw_bytes):
    """Deserialize raw_bytes against the module-level avro_api_schema."""
    parsed_schema = SchemaFromJSONData(avro_api_schema)
    decoder = BinaryDecoder(io.BytesIO(raw_bytes))
    return DatumReader(parsed_schema).read(decoder)
def run(self, n):
    """Benchmark Avro binary serialization/deserialization over n iterations.

    Logs the serialized payload size and the average per-operation
    serialize / deserialize times in nanoseconds.

    :param n: number of measurement iterations
    """
    total_ser = 0.0
    total_deser = 0.0
    bytes_len = 0
    # Writer and reader are schema-bound and reusable; build them once
    # instead of once per iteration (their construction was never inside
    # the timed region, so measurements are unaffected).
    datum_writer = DatumWriter(self.movies_schema)
    datum_reader = DatumReader(self.movies_schema)
    for _ in range(n):
        bytes_writer = io.BytesIO()
        encoder = BinaryEncoder(bytes_writer)
        tic = timeit.default_timer()
        datum_writer.write(self.movies_data, encoder)
        total_ser += timeit.default_timer() - tic

        payload = bytes_writer.getvalue()
        bytes_len = len(payload)

        decoder = BinaryDecoder(io.BytesIO(payload))
        tic = timeit.default_timer()
        datum_reader.read(decoder)
        total_deser += timeit.default_timer() - tic

    self.logger.log(logging.INFO, "serialized len: %s bytes", bytes_len)
    # Convert cumulative seconds to average nanoseconds per operation.
    avg_ser = (total_ser * (10 ** 9)) / n
    avg_deser = (total_deser * (10 ** 9)) / n
    self.logger.log(logging.INFO, "Serialization time: \n%s", avg_ser)
    self.logger.log(logging.INFO, "De-serialization time: \n%s", avg_deser)
def consume_records(topic):
    """Consume Avro messages from a Kafka topic and insert decoded rows into MySQL.

    Creates table kafkav1 if needed, then inserts one row per message.

    :param topic: Kafka topic name to subscribe to
    """
    consumer = KafkaConsumer(bootstrap_servers=[broker_config],
                             auto_offset_reset='earliest',  # read all, default is latest
                             enable_auto_commit=True,
                             # group_id='test-consumer-group',  # to avoid message be consumed more than once
                             consumer_timeout_ms=120)
    consumer.subscribe([topic])
    for _msg in consumer:
        bytes_reader = io.BytesIO(_msg.value)
        decoder = avro.io.BinaryDecoder(bytes_reader)
        reader = DatumReader(schema)
        val = reader.read(decoder)
        test_val = list(val.values())
        logging.info(test_val)
        # connect to RDBMS
        conn = pymysql.connect(host, user, passwd, db)
        try:
            cursor = conn.cursor()
            cursor.execute("CREATE TABLE IF NOT EXISTS kafkav1 "
                           "(id int auto_increment primary key, "
                           "testing text, favorite_color text, favorite_number int)")
            # BUGFIX: parameterized query — the previous %-interpolation was
            # vulnerable to SQL injection and broke on values containing quotes.
            cursor.execute(
                "INSERT INTO kafkav1(testing,favorite_number,favorite_color) "
                "values (%s, %s, %s)",
                (test_val[0], test_val[1], test_val[2]))
            conn.commit()
        finally:
            # Ensure the connection is released even if the insert fails.
            conn.close()
        logging.info('send test data to mysql sink')
class Deserializer(object):
    """Wraps a DatumReader for repeatedly decoding records of one schema."""

    def __init__(self, schema_str):
        self.reader = DatumReader(avro.schema.parse(schema_str))

    def deserialize(self, rec_bytes):
        """Decode one Avro-encoded record from rec_bytes."""
        return self.reader.read(BinaryDecoder(StringIO(rec_bytes)))
def decode(schema, msg_value):
    """Decode an Avro message, skipping its 5-byte framing header.

    The first five bytes are skipped before decoding — presumably the
    Confluent wire-format magic byte + schema id; confirm against producer.
    """
    stream = io.BytesIO(msg_value)
    stream.seek(5)
    return DatumReader(schema).read(BinaryDecoder(stream))
def _deserialize_message(content, schema_path: Path) -> object:
    """Deserialize one Avro-encoded message.

    :param content: Avro binary payload (bytes-like).
    :param schema_path: path to the .avsc schema file, read as text.
    :returns: the decoded datum (a dict for record schemas) — note the
        previous ``ByteString`` return annotation was inaccurate, since
        ``DatumReader.read`` returns the decoded value, not bytes.
    """
    # Schema is re-read and re-parsed from disk on every call.
    schema = avro.schema.parse(schema_path.read_text())
    bytes_reader = io.BytesIO(content)
    reader = DatumReader(schema)
    decoder = avro.io.BinaryDecoder(bytes_reader)
    return reader.read(decoder)
def read(self):
    """Subscribe to self.topic and yield decoded Avro values, one per message."""
    self.consumer.subscribe([self.topic])
    # The reader is schema-bound and stateless across messages;
    # build it once rather than once per message.
    reader = DatumReader(self.schema)
    for msg in self.consumer:
        decoder = avro.io.BinaryDecoder(io.BytesIO(msg.value))
        yield reader.read(decoder)
def avro_deserialization(value_schema, x):
    """Decode Avro bytes x, reading with the same schema they were written with."""
    reader = DatumReader(writer_schema=value_schema, reader_schema=value_schema)
    with BytesIO(x) as stream:
        return reader.read(BinaryDecoder(stream))
def binToObj(ab):
    """Print every record decoded from the Avro byte string ab.

    Records are read with the module-level schema ``sc`` until the decoder
    fails (end of buffer).

    :param ab: Avro binary payload containing zero or more records.
    """
    bytes_reader = io.BytesIO(ab)
    decoder = BinaryDecoder(bytes_reader)
    reader = DatumReader(sc)
    while True:
        try:
            rec = reader.read(decoder)
            print(rec)
        except Exception:
            # BUGFIX: narrowed from a bare ``except:`` so KeyboardInterrupt
            # and SystemExit are no longer swallowed; end-of-data still
            # terminates the loop as before.
            break
class Deserializer(object):
    """Schema-bound Avro deserializer, compatible with Python 2 and 3."""

    def __init__(self, schema_str):
        # avro renamed schema.parse() to schema.Parse() in its Python 3 release.
        parse_schema = avro.schema.Parse if sys.version_info >= (3, ) else avro.schema.parse
        self.reader = DatumReader(parse_schema(schema_str))

    def deserialize(self, rec_bytes):
        """Decode a single record from rec_bytes."""
        return self.reader.read(BinaryDecoder(string_io(rec_bytes)))
def unpack(self, payload):
    """Decode a registry-framed Avro payload; fall back to utf-8 text."""
    magic, schema_id = struct.unpack('>bi', payload[:5])
    if magic != MAGIC_BYTES:
        # Not Avro-framed: treat the whole payload as plain text.
        return payload.decode()
    # Look the writer schema up in the registry by id, then decode the body.
    schema = self.RegistryClient.get_by_id(schema_id)
    decoder = BinaryDecoder(io.BytesIO(payload[5:]))
    return DatumReader(schema).read(decoder)
def deserialize(flight_info_bytes):
    """Decode Avro-encoded flight info bytes.

    NOTE(review): the decoded record (``flight_info``) is discarded and a
    hard-coded list is returned instead — this looks like leftover stub /
    debug code; confirm whether the decoded value should be returned.

    :param flight_info_bytes: Avro binary payload, or None.
    :returns: a hard-coded list of id dicts, or None when input is None.
    """
    if flight_info_bytes is not None:
        bytes_reader = BytesIO(flight_info_bytes)
        decoder = BinaryDecoder(bytes_reader)
        # Schema is re-read and re-parsed from disk on every call.
        schema_flight_info = Parse(
            open(dir_path + "/flight-info.schema.avsc", "rb").read())
        reader = DatumReader(schema_flight_info)
        flight_info = reader.read(decoder)
        return [{"id": "1"}, {"id": "2"}]
    else:
        return None
class Deserializer(object):
    """Decodes Avro records with a schema fixed at construction time."""

    def __init__(self, schema_str):
        if sys.version_info >= (3,):
            parsed = avro.schema.Parse(schema_str)  # py3 API name
        else:
            parsed = avro.schema.parse(schema_str)  # py2 API name
        self.reader = DatumReader(parsed)

    def deserialize(self, rec_bytes):
        """Return the datum decoded from rec_bytes."""
        decoder = BinaryDecoder(string_io(rec_bytes))
        return self.reader.read(decoder)
def read_value(schema: TypedSchema, bio: io.BytesIO):
    """Decode one value from *bio* according to *schema*'s type.

    Avro payloads are binary-decoded; JSON Schema payloads are loaded and
    validated. Raises InvalidPayload on a validation failure and ValueError
    for an unsupported schema type.
    """
    schema_type = schema.schema_type
    if schema_type is SchemaType.AVRO:
        decoder = BinaryDecoder(bio)
        return DatumReader(schema.schema).read(decoder)
    if schema_type is SchemaType.JSONSCHEMA:
        value = load(bio)
        try:
            schema.schema.validate(value)
        except ValidationError as e:
            # Surface schema violations as our own payload error.
            raise InvalidPayload from e
        return value
    raise ValueError("Unknown schema type")
def deserialize_avro(binary_data, schema):
    """Deserialize Avro binary data.

    :param binary_data: event data, binary encoded (bytes)
    :param schema: avro schema describing the binary data
    :return: tuple of (deserialized data, the schema it was decoded with)
    """
    decoder = BinaryDecoder(io.BytesIO(binary_data))
    decoded = DatumReader(schema).read(decoder)
    return decoded, schema
def deserialize(flight_info_bytes):
    """Decode Avro-encoded flight info bytes.

    NOTE(review): the decoded record (``flight_info``) is ignored and a
    hard-coded JSON string is returned — likely stub/debug leftover;
    confirm the intended return value.

    :param flight_info_bytes: Avro binary payload, or None.
    :returns: a hard-coded JSON array string, or None when input is None.
    """
    if flight_info_bytes is not None:
        bytes_reader = BytesIO(flight_info_bytes)
        decoder = BinaryDecoder(bytes_reader)
        # Schema is re-read and re-parsed from disk on every call.
        schema_flight_info = Parse(
            open(dir_path + "/flight-info.schema.avsc", "rb").read())
        reader = DatumReader(schema_flight_info)
        flight_info = reader.read(decoder)
        return json.dumps([{"id": 907955534287978496}])
    else:
        return None
def decode_avro(self, payload):
    """Decode a schema-registry framed Avro payload.

    :returns: (decoded_record, schema_name)
    :raises ValueError: when the payload lacks the expected magic byte.
    """
    magic, schema_id = struct.unpack('>bi', payload[:5])
    # no magic bytes, something is wrong
    if magic != MAGIC_BYTES:
        raise ValueError
    # Avro value format: fetch the writer schema from the registry by id.
    schema = self.register_client.get_by_id(schema_id)
    decoder = BinaryDecoder(io.BytesIO(payload[5:]))
    record = DatumReader(schema).read(decoder)
    return record, schema.name
def parse_avro_msg(msg, avro_schema):
    """
    Parses an avro record using a specified avro schema.

    Args:
        :msg: the avro message to parse
        :avro_schema: the avro schema

    Returns:
        The parsed/decoded message
    """
    decoder = BinaryDecoder(BytesIO(msg))
    return DatumReader(avro_schema).read(decoder)
class AvroContext(pp.TaskContext):
    """Task context that decodes Avro-serialized input values."""

    datum_reader = None  # FIXME not strictly necessary

    def set_job_conf(self, vals):
        """Build the DatumReader from the Avro schema carried in the job conf."""
        super(AvroContext, self).set_job_conf(vals)
        parsed = avro.schema.parse(self._job_conf[AVRO_SCHEMA_KEY])
        self.datum_reader = DatumReader(parsed)

    def get_input_value(self):
        """Decode and return the current raw input value."""
        # FIXME reuse, reuse, reuse
        sys.stderr.write('value: %r\n' % self._value)
        decoder = BinaryDecoder(StringIO(self._value))
        return self.datum_reader.read(decoder)
def _decode_avro(self, avro_data):
    """
    Decode Avro Message.

    Args:
        avro_data (binary): Avro data to decode.

    Returns:
        dict
    """
    decoder = BinaryDecoder(io.BytesIO(avro_data.value))
    return DatumReader(self.schema).read(decoder)
def unpack(self, payload):
    """Decode a registry-framed Avro value, or a string key.

    Non-Avro payloads carry a trailing 8 bytes (timestamp inside the key)
    which are stripped before decoding the key text.
    """
    MAGIC_BYTES = 0
    magic, schema_id = struct.unpack('>bi', payload[:5])
    if magic == MAGIC_BYTES:
        # Avro value format: look the writer schema up by id.
        schema = self.register_client.get_by_id(schema_id)
        decoder = BinaryDecoder(io.BytesIO(payload[5:]))
        return DatumReader(schema).read(decoder)
    # String key — timestamp is inside the key, drop the last 8 bytes.
    return payload[:-8].decode()
def _unpack(self, payload):
    """Decode a registry-framed Avro payload; otherwise decode as text."""
    magic, schema_id = struct.unpack('>bi', payload[:5])
    if magic != self.MAGIC_BYTES:
        # If KSQL payload, exclude timestamp which is inside the key:
        # payload[:-8].decode()
        return payload.decode()
    # Avro value format: fetch the writer schema from the registry by id.
    schema = self.register_client.get_by_id(schema_id)
    decoder = BinaryDecoder(io.BytesIO(payload[5:]))
    return DatumReader(schema).read(decoder)
def callback(message):
    """Deserialize and print a received message, then ack it."""
    # The serialization type is advertised in a message attribute.
    encoding = message.attributes.get("googclient_schemaencoding")
    if encoding == "BINARY":
        decoder = BinaryDecoder(io.BytesIO(message.data))
        message_data = DatumReader(avro_schema).read(decoder)
        print(f"Received a binary-encoded message:\n{message_data}")
    elif encoding == "JSON":
        message_data = json.loads(message.data)
        print(f"Received a JSON-encoded message:\n{message_data}")
    else:
        print(f"Received a message with no encoding:\n{message}")
    message.ack()
def unpack(payload, windowed, register_client):
    """Decode a registry-framed Avro payload, or a (possibly windowed) string key."""
    magic, schema_id = struct.unpack('>bi', payload[:5])
    if magic == MAGIC_BYTES:
        # Avro value format: resolve the writer schema via the registry.
        schema = register_client.get_by_id(schema_id)
        decoder = BinaryDecoder(io.BytesIO(payload[5:]))
        return DatumReader(schema).read(decoder)
    # String key path.
    if windowed == 'TRUE':
        # Windowed KSQL payload: drop the trailing 8 bytes before decoding.
        return payload[:-8].decode()
    if windowed == 'FALSE':
        return payload.decode()
    # NOTE: any other `windowed` value falls through and returns None,
    # matching the original behavior.
class AvroCodec(object):
    """json/pickle-style codec (dump/dumps/load/loads) for one Avro schema."""

    def __init__(self, schema):
        self._raw_schema = schema
        self._avro_schema = avro.schema.parse(json.dumps(schema))
        self._reader = DatumReader(self._avro_schema)

    def dump(self, obj, fp):
        """
        Serializes obj as an avro-format byte stream to the provided fp
        file-like object stream.
        """
        # Validate up front so callers get a typed schema error, not a
        # low-level encoding failure.
        if not validate(obj, self._raw_schema):
            raise AvroTypeException(self._avro_schema, obj)
        fastavro_write_data(fp, obj, self._raw_schema)

    def dumps(self, obj):
        """
        Serializes obj to an avro-format byte array and returns it.
        """
        with BytesIO() as buf:
            self.dump(obj, buf)
            return buf.getvalue()

    def load(self, fp):
        """
        Deserializes the byte stream contents of the given file-like object
        into an object and returns it.
        """
        return self._reader.read(BinaryDecoder(fp))

    def loads(self, data):
        """
        Deserializes the given byte array into an object and returns it.
        """
        with BytesIO(data) as buf:
            return self.load(buf)
def extract_messages(ams, ingest_sub, bulk_size, schema, verify):
    """Pull up to bulk_size messages from an AMS subscription and Avro-decode them.

    Messages whose data cannot be decoded, or whose "tags" field is empty,
    are skipped with a warning.

    :param ams: AMS client used to pull messages
    :param ingest_sub: subscription name to pull from
    :param bulk_size: maximum number of messages to pull
    :param schema: avro schema the message data was written with
    :param verify: TLS-verification flag passed through to pull_sub
    :returns: (decoded_msgs, last_msg_id) — list of (msg_id, decoded_record)
        tuples, and the id of the last consumed message ("-1" if none).
    """
    # consume metric data messages
    consumed_msgs = ams.pull_sub(ingest_sub, num=bulk_size,
                                 return_immediately=True, verify=verify)

    # one schema-bound reader serves every message
    avro_reader = DatumReader(writers_schema=schema)

    # all the decoded messages that will be returned
    decoded_msgs = []
    for msg in consumed_msgs:
        try:
            # decode the data field again using the provided avro schema
            msg_bytes = BytesIO(msg[1].get_data())
            msg_decoder = BinaryDecoder(msg_bytes)
            avro_msg = avro_reader.read(msg_decoder)

            # check that the tags field is present
            if avro_msg["tags"] is None:
                raise KeyError("tags field is empty")

            decoded_msgs.append((msg[0], avro_msg))
        except Exception as e:
            # BUGFIX: Exception has no .message attribute on Python 3, so the
            # old `e.message` raised AttributeError inside this handler;
            # str(e) works on both Python 2 and 3.
            LOGGER.warning(
                "Could not extract data from ams message {}, {}".format(
                    msg[0], str(e)))

    last_msg_id = "-1"
    if len(consumed_msgs) > 0:
        last_msg_id = consumed_msgs.pop()[0]

    return decoded_msgs, last_msg_id
from itertools import repeat
from time import time
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter, BinaryEncoder, BinaryDecoder
import sys

# Micro-benchmark (Python 2 syntax): re-serialize every record of an Avro
# container file into per-record binary buffers, then time how fast
# DatumReader decodes them back, printing mean milliseconds per record.
# NOTE(review): BytesIO is used below but not imported in this chunk —
# presumably an `io`/`cStringIO` import exists elsewhere; confirm.
LOOPS = 1

with open(sys.argv[1]) as reader:
    datum_reader = DatumReader()
    file_reader = DataFileReader(reader, datum_reader)
    # The writer schema is discovered from the container file itself.
    SCHEMA = datum_reader.writers_schema
    BUFS = []
    datum_writer = DatumWriter(SCHEMA)
    for record in file_reader:
        buf = BytesIO()
        encoder = BinaryEncoder(buf)
        datum_writer.write(record, encoder)
        BUFS.append(buf)

datum_reader = DatumReader(SCHEMA)
start = time()
n = 0
for _ in repeat(None, LOOPS):
    for buf in BUFS:
        n += 1
        buf.seek(0)  # rewind so each pass re-reads from the start
        record = datum_reader.read(BinaryDecoder(buf))
# Mean milliseconds per decoded record (Python 2 print statement).
print 1000. * (time() - start) / n
def fromKey(self, key, avroType):
    """Decode a base64-encoded Avro key into a datum of avroType.

    :param key: base64-encoded Avro binary key
    :param avroType: type object exposing a .schema attribute
    """
    # Local renamed from `bytes`, which shadowed the builtin.
    raw = io.BytesIO(base64.b64decode(key))
    return DatumReader(avroType.schema).read(BinaryDecoder(raw))
from itertools import repeat
from time import time
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter, BinaryEncoder, BinaryDecoder
import sys

# Micro-benchmark (Python 2 syntax): re-serialize every record of an Avro
# container file into per-record binary buffers, then time how fast
# DatumReader decodes them back, printing mean milliseconds per record.
# NOTE(review): BytesIO is used below but not imported in this chunk —
# presumably an `io`/`cStringIO` import exists elsewhere; confirm.
LOOPS = 1

with open(sys.argv[1]) as reader:
    datum_reader = DatumReader()
    file_reader = DataFileReader(reader, datum_reader)
    # The writer schema is discovered from the container file itself.
    SCHEMA = datum_reader.writers_schema
    BUFS = []
    datum_writer = DatumWriter(SCHEMA)
    for record in file_reader:
        buf = BytesIO()
        encoder = BinaryEncoder(buf)
        datum_writer.write(record, encoder)
        BUFS.append(buf)

datum_reader = DatumReader(SCHEMA)
start = time()
n = 0
for _ in repeat(None, LOOPS):
    for buf in BUFS:
        n += 1
        buf.seek(0)  # rewind so each pass re-reads from the start
        record = datum_reader.read(BinaryDecoder(buf))
# Mean milliseconds per decoded record (Python 2 print statement).
print 1000. * (time() - start) / n