def main(args):
    log = logging.getLogger(__name__)
    log.setLevel(logging.INFO)
    sys_log = logging.handlers.SysLogHandler("/dev/log")
    sys_format = logging.Formatter('%(name)s[%(process)d]: %(levelname)s %(message)s')
    sys_log.setFormatter(sys_format)
    log.addHandler(sys_log)

    # Avro container files must be opened in binary mode
    reader = DataFileReader(open(args.avro_file, "rb"), DatumReader())
    schema = reader.datum_reader.writers_schema

    for i, row in enumerate(reader):
        log.debug("Consumer row: %s", row)
        writer = DatumWriter(schema)
        bytes_writer = io.BytesIO()
        encoder = BinaryEncoder(bytes_writer)
        writer.write(row, encoder)
        raw_bytes = bytes_writer.getvalue()
        # b64encode returns bytes; decode so the value is JSON-serializable
        b64enc = base64.b64encode(raw_bytes).decode("ascii")
        msg = {"messages": [{"data": b64enc}]}
        json_str = json.dumps(msg)
        log.debug("json msg: %s", json_str)
        publish(json_str, args.ams_endpoint, args.ams_project,
                args.ams_topic, args.ams_key, log)
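# A minimal consumer-side sketch for the function above (not part of the original
# code): it assumes the same writer's schema object is available as `schema` and
# that the JSON payload keeps the {"messages": [{"data": <base64>}]} shape used above.
import base64
import io
import json

from avro.io import BinaryDecoder, DatumReader


def decode_ams_message(json_str, schema):
    msg = json.loads(json_str)
    raw_bytes = base64.b64decode(msg["messages"][0]["data"])
    return DatumReader(schema).read(BinaryDecoder(io.BytesIO(raw_bytes)))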
def send_avro_record_to_kafka(topic, value, bootstrap_servers, avro_schema_json):
    value_schema = avro.schema.parse(avro_schema_json)
    producer_config = {
        "bootstrap.servers": bootstrap_servers,
        "request.timeout.ms": "1000",
    }
    producer = Producer(producer_config)

    # Encode the record to Avro binary before producing
    writer = DatumWriter(value_schema)
    bytes_writer = io.BytesIO()
    encoder = BinaryEncoder(bytes_writer)
    writer.write(value, encoder)

    try:
        producer.produce(topic=topic, value=bytes_writer.getvalue())
    except Exception as e:
        print(f"Exception while producing record value - {value} to topic - {topic}: {e}")
    else:
        print(f"Successfully produced record value - {value} to topic - {topic}")

    producer.flush()
def toKey(self, x, avroType):
    x = jsonEncoder(avroType, x, False)
    buf = io.BytesIO()  # avoid shadowing the builtin `bytes`
    writer = DatumWriter(avroType.schema)
    writer.write(x, BinaryEncoder(buf))
    buf.flush()
    return base64.b64encode(buf.getvalue())
class Meta:
    def __init__(self, callback, service_name, param_schema, result_schema, version=0):
        self.callback = callback
        self.service_name = service_name
        self.param_schema = SchemaFromJSONData(param_schema, Names())
        self.result_schema = SchemaFromJSONData(result_schema, Names())
        self.version = version
        self._param_writer = DatumWriter(self.param_schema)
        self._param_reader = DatumReader(self.param_schema)
        self._result_writer = DatumWriter(self.result_schema)
        self._result_reader = DatumReader(self.result_schema)

    def decode_param(self, byte_mem):
        return self._param_reader.read(BinaryDecoder(BytesIO(byte_mem)))

    def encode_param(self, param):
        logger.info(param)
        buf = BytesIO()  # avoid shadowing the `io` module
        self._param_writer.write(param, BinaryEncoder(buf))
        return buf.getbuffer().tobytes()

    def decode_result(self, byte_mem):
        return self._result_reader.read(BinaryDecoder(BytesIO(byte_mem)))

    def encode_result(self, result):
        buf = BytesIO()
        self._result_writer.write(result, BinaryEncoder(buf))
        return buf.getbuffer().tobytes()
def _avro_serialize(msg):
    avro_writer = DatumWriter(self.shared.topic['schema'])
    bytesio = BytesIO()
    encoder = BinaryEncoder(bytesio)
    avro_writer.write(msg, encoder)
    return bytesio.getvalue()
def serialize(thing):
    writer = DatumWriter(SCHEMA)
    buf = io.BytesIO()  # BinaryEncoder emits bytes, so use BytesIO rather than StringIO
    writer.write(thing, BinaryEncoder(buf))
    v = buf.getvalue()
    buf.close()
    return v
def serialize(
    self,
    data,    # type: ObjectType
    schema,  # type: Union[str, bytes, avro.schema.Schema]
):
    # type: (ObjectType, Union[str, bytes, avro.schema.Schema]) -> bytes
    """Convert the provided value to its binary representation and write it to the stream.

    Schema must be an Avro RecordSchema:
    https://avro.apache.org/docs/1.10.0/gettingstartedpython.html#Defining+a+schema

    :param data: An object to serialize
    :type data: ObjectType
    :param schema: An Avro RecordSchema
    :type schema: Union[str, bytes, avro.schema.Schema]
    :returns: Encoded bytes
    :rtype: bytes
    """
    if not schema:
        raise ValueError("Schema is required in Avro serializer.")

    if not isinstance(schema, avro.schema.Schema):
        schema = avro.schema.parse(schema)

    # Cache one DatumWriter per schema so it is not rebuilt on every call
    try:
        writer = self._schema_writer_cache[str(schema)]
    except KeyError:
        writer = DatumWriter(schema)
        self._schema_writer_cache[str(schema)] = writer

    stream = BytesIO()
    with stream:
        writer.write(data, BinaryEncoder(stream))
        encoded_data = stream.getvalue()
    return encoded_data
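# Hypothetical usage of the serialize() method above (a sketch; the wrapper class
# name AvroObjectSerializer and the example schema are assumptions, not taken
# from the original source).
_USER_SCHEMA = """{
    "type": "record",
    "name": "User",
    "fields": [
        {"name": "name", "type": "string"},
        {"name": "age", "type": "int"}
    ]
}"""

# serializer = AvroObjectSerializer()
# payload = serializer.serialize({"name": "Ada", "age": 36}, _USER_SCHEMA)
# assert isinstance(payload, bytes)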
def encode_avro_message(data):
    datum_writer = DatumWriter(get_media_avro_schema())
    bytes_writer = io.BytesIO()
    encoder = avro.io.BinaryEncoder(bytes_writer)
    datum_writer.write(data, encoder)
    raw_bytes = bytes_writer.getvalue()
    return raw_bytes
def run(self, n):
    # JSON Serializer
    # serializer = ajs.AvroJsonSerializer(self.movies_schema)
    # json_data = serializer.to_json(self.movies_data)

    total_ser = 0
    total_deser = 0
    bytes_len = 0
    for i in range(n):
        # Serialize
        datum_writer = DatumWriter(self.movies_schema)
        bytes_writer = io.BytesIO()
        encoder = BinaryEncoder(bytes_writer)
        tic = timeit.default_timer()
        datum_writer.write(self.movies_data, encoder)
        elapsed = timeit.default_timer() - tic
        payload = bytes_writer.getvalue()
        total_ser += elapsed
        bytes_len = len(payload)

        # Deserialize
        bytes_reader = io.BytesIO(payload)
        decoder = BinaryDecoder(bytes_reader)
        reader = DatumReader(self.movies_schema)
        tic2 = timeit.default_timer()
        movies = reader.read(decoder)
        elapsed2 = timeit.default_timer() - tic2
        total_deser += elapsed2

    self.logger.log(logging.INFO, "serialized len: %s bytes", bytes_len)
    avg_ser = (total_ser * (10 ** 9)) / n    # average nanoseconds per iteration
    avg_deser = (total_deser * (10 ** 9)) / n
    self.logger.log(logging.INFO, "Serialization time: \n%s", avg_ser)
    self.logger.log(logging.INFO, "De-serialization time: \n%s", avg_deser)
def serialize(myschema, myobject):
    buf = io.BytesIO()
    encoder = BinaryEncoder(buf)
    writer = DatumWriter(writer_schema=myschema)
    writer.write(myobject, encoder)
    buf.seek(0)
    return buf.read()
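# A matching deserialize() sketch (an assumption, not part of the original source):
# it reads a payload produced by serialize() back with the same schema.
import io

from avro.io import BinaryDecoder, DatumReader


def deserialize(myschema, payload):
    buf = io.BytesIO(payload)
    decoder = BinaryDecoder(buf)
    reader = DatumReader(myschema)
    return reader.read(decoder)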
def __call__(self, state, scope, pos, paramTypes, x):
    schema = avro.schema.parse(json.dumps(paramTypes[0]))
    x = untagUnion(x, paramTypes[0])
    buf = io.BytesIO()  # avoid shadowing the builtin `bytes`
    writer = DatumWriter(schema)
    writer.write(x, BinaryEncoder(buf))
    buf.flush()
    return buf.getvalue()
def __call__(self, state, scope, pos, paramTypes, x):
    schema = avro.schema.Parse(json.dumps(paramTypes[0]))
    x = untagUnion(x, paramTypes[0])
    bytes_io = io.BytesIO()
    writer = DatumWriter(schema)
    writer.write(x, BinaryEncoder(bytes_io))
    bytes_io.flush()
    return bytesToString(bytes_io.getvalue())
def to_avro(self):
    writer = DatumWriter()
    # set_writer_schema() is the avro-python3 API; newer avro releases take the
    # schema in the constructor, i.e. DatumWriter(SYSLOG_AVRO_SCHEMA)
    writer.set_writer_schema(SYSLOG_AVRO_SCHEMA)
    buffer = BytesIO()
    encoder = BinaryEncoder(buffer)
    writer.write(self.to_avro_dict(), encoder)
    data = buffer.getbuffer().tobytes()
    return bytearray(data)
def _serialize_message(content, schema_path: Path) -> ByteString:
    schema = avro.schema.parse(schema_path.read_text())
    bytes_writer = io.BytesIO()
    writer = DatumWriter(schema)
    encoder = avro.io.BinaryEncoder(bytes_writer)
    writer.write(content, encoder)
    return bytes_writer.getvalue()
def objToBin():
    bytes_writer = io.BytesIO()
    encoder = BinaryEncoder(bytes_writer)
    writer_binary = DatumWriter(sc)
    for d in datum:
        writer_binary.write(d, encoder)
    ab = bytes_writer.getvalue()
    return ab
def compose_data(timestamp, src_vmtype, host_ip, account_id, dest_ip):
    writer = DatumWriter(get_schema())
    bytes_writer = io.BytesIO()
    encoder = avro.io.BinaryEncoder(bytes_writer)
    message = '{"eventName": "Neighbour_Unreachable", "accountId":"%s", "destIp":"%s"}' \
              % (account_id, dest_ip)
    raw_data = message.encode("utf-8")  # bytes() needs an explicit encoding in Python 3
    writer.write({"timestamp": timestamp, "src": src_vmtype, "host_ip": host_ip,
                  "rawdata": raw_data}, encoder)
    raw_bytes = bytes_writer.getvalue()
    return raw_bytes
class AvroSerializer(object):

    def __init__(self, schema_str):
        schema = avro.schema.parse(schema_str)
        self.writer = DatumWriter(schema)

    def serialize(self, record):
        f = io.BytesIO()  # BinaryEncoder emits bytes; StringIO fails on Python 3
        encoder = BinaryEncoder(f)
        self.writer.write(record, encoder)
        return f.getvalue()
def avro_view(request):
    data = DataSource().data
    buffer = BytesIO()
    schema = SchemaFromJSONData(avro_api_schema)
    writer = DatumWriter(schema)
    encoder = BinaryEncoder(buffer)
    writer.write(data, encoder)
    return HttpResponse(buffer.getvalue(), content_type='application/octet-stream')
def compose_data(timestamp, src_vmtype, host_ip, account_id, proc_name):
    writer = DatumWriter(get_schema())
    bytes_writer = io.BytesIO()
    encoder = avro.io.BinaryEncoder(bytes_writer)
    message = '{"eventName": "Process_Down", "accountId":"%s", "ProcName":"%s"}' \
              % (account_id, proc_name)
    raw_data = message.encode("utf-8")  # bytes() needs an explicit encoding in Python 3
    writer.write({"timestamp": timestamp, "src": src_vmtype, "host_ip": host_ip,
                  "rawdata": raw_data}, encoder)
    raw_bytes = bytes_writer.getvalue()
    return raw_bytes
def encode(self, data):
    raw_bytes = None
    try:
        writer = DatumWriter(self.schema)
        bytes_writer = io.BytesIO()
        encoder = BinaryEncoder(bytes_writer)
        writer.write(data, encoder)
        raw_bytes = bytes_writer.getvalue()
    except Exception:
        print("Error encoding data", sys.exc_info())
    return raw_bytes
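# A matching decode() method sketch for the class above (illustrative, not from
# the original source); it assumes self.schema holds the writer's schema and that
# BinaryDecoder and DatumReader are imported from avro.io.
def decode(self, raw_bytes):
    bytes_reader = io.BytesIO(raw_bytes)
    decoder = BinaryDecoder(bytes_reader)
    return DatumReader(self.schema).read(decoder)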
class Serializer(object):

    def __init__(self, schema_str):
        schema = avro.schema.parse(schema_str)
        self.writer = DatumWriter(schema)

    def serialize(self, record):
        f = io.BytesIO()  # BinaryEncoder emits bytes; StringIO fails on Python 3
        encoder = BinaryEncoder(f)
        self.writer.write(record, encoder)
        return f.getvalue()
class AvroSerializer(object):

    def __init__(self, schema):
        self.schema = schema
        self.datum_writer = DatumWriter(schema)

    def serialize(self, record):
        f = io.BytesIO()  # BinaryEncoder emits bytes; StringIO fails on Python 3
        encoder = BinaryEncoder(f)
        self.datum_writer.write(record, encoder)
        return f.getvalue()
def avro_encode(cls, json_data, schema=None):
    """Serialize JSON data to Avro binary.

    :param json_data: the record (as a dict) to encode
    :param schema: optional Avro schema; defaults to cls.RESPONSE_SCHEMA
    :return: the Avro-encoded bytes
    """
    bio = BytesIO()
    binary_encoder = BinaryEncoder(bio)
    dw = DatumWriter(writer_schema=schema or cls.RESPONSE_SCHEMA)
    dw.write(json_data, binary_encoder)
    return bio.getvalue()
def encode(self, schema_file, data):
    raw_bytes = None
    try:
        schema = avro.schema.Parse(open(schema_file).read())
        writer = DatumWriter(schema)
        bytes_writer = io.BytesIO()
        encoder = BinaryEncoder(bytes_writer)
        writer.write(data, encoder)
        raw_bytes = bytes_writer.getvalue()
    except Exception:
        print("Error encoding data", sys.exc_info())
    return raw_bytes
def _avro_serialize(msg):
    opened_schema = load_schema(schema)
    avro_writer = DatumWriter(opened_schema)
    bytesio = BytesIO()
    encoder = BinaryEncoder(bytesio)
    if isinstance(msg, list):
        for m in msg:
            avro_writer.write(m, encoder)
    else:
        avro_writer.write(msg, encoder)
    return bytesio.getvalue()
def serialize_val(val, serializer, schema=None):
    if serializer == "Avro":
        writer = DatumWriter(schema)
        bytes_writer = io.BytesIO()
        encoder = avro.io.BinaryEncoder(bytes_writer)
        writer.write(val, encoder)
        return_val = bytes_writer.getvalue()
    elif serializer == "JSON":
        return_val = json.dumps(val)
    else:
        return_val = val
    return return_val
def test_sanity():
    """
    Ensures that our "base" and "good" schemas are actually
    forwards- and backwards-compatible
    """
    # fst schema / record (fixed values must be bytes on Python 3)
    fst_schema = schema.parse(open("%s/MyRecord.base.avsc" % BASE_DIR).read())
    fst_writer = DatumWriter(writers_schema=fst_schema)
    fst_record = {
        "fieldWithoutDefaultValue": 0,
        "properField": 0,
        "enumField": "A",
        "unionField": None,
        "arrayField": ["world"],
        "mapField": {"hello": "world"},
        "fixedField": b"aaaaaaaaaaaaaaaa",
    }

    # sec schema / record
    sec_schema = schema.parse(open("%s/MyRecord.good.avsc" % BASE_DIR).read())
    sec_writer = DatumWriter(writers_schema=sec_schema)
    sec_record = {
        "fieldWithoutDefaultValue": 0,
        "properField2": 0,
        "enumField": "B",
        "unionField": None,
        "arrayField": ["world"],
        "fixedField": b"bbbbbbbbbbbbbbbb",
    }

    # Encode record w/ fst
    fst_buf = io.BytesIO()  # binary encoding needs a bytes buffer, not StringIO
    fst_encoder = BinaryEncoder(fst_buf)
    fst_writer.write(fst_record, fst_encoder)
    fst_data = fst_buf.getvalue()

    # Encode record w/ sec
    sec_buf = io.BytesIO()
    sec_encoder = BinaryEncoder(sec_buf)
    sec_writer.write(sec_record, sec_encoder)
    sec_data = sec_buf.getvalue()

    # writers == fst, readers == sec
    sec_reader = DatumReader(writers_schema=fst_schema, readers_schema=sec_schema)
    sec_decoder = BinaryDecoder(io.BytesIO(fst_data))
    sec_from_fst = sec_reader.read(sec_decoder)  # no exception -> good

    # writers == sec, readers == fst
    fst_reader = DatumReader(writers_schema=sec_schema, readers_schema=fst_schema)
    fst_decoder = BinaryDecoder(io.BytesIO(sec_data))
    fst_from_sec = fst_reader.read(fst_decoder)  # no exception -> good
def write_value(schema: TypedSchema, bio: io.BytesIO, value: dict):
    if schema.schema_type is SchemaType.AVRO:
        writer = DatumWriter(schema.schema)
        writer.write(value, BinaryEncoder(bio))
    elif schema.schema_type is SchemaType.JSONSCHEMA:
        try:
            schema.schema.validate(value)
        except ValidationError as e:
            raise InvalidPayload from e
        bio.write(json_encode(value, binary=True))
    else:
        raise ValueError("Unknown schema type")
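# A matching read_value() sketch (an assumption, not part of the original source):
# it mirrors write_value() for the AVRO and JSONSCHEMA cases, using json.loads in
# place of whatever JSON helper the original module provides.
def read_value(schema: TypedSchema, bio: io.BytesIO):
    if schema.schema_type is SchemaType.AVRO:
        return DatumReader(schema.schema).read(BinaryDecoder(bio))
    if schema.schema_type is SchemaType.JSONSCHEMA:
        return json.loads(bio.read())
    raise ValueError("Unknown schema type")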
class Serializer(object):

    def __init__(self, schema_str):
        if sys.version_info >= (3,):
            schema = avro.schema.Parse(schema_str)
        else:
            schema = avro.schema.parse(schema_str)
        self.writer = DatumWriter(schema)

    def serialize(self, record):
        f = string_io()
        encoder = BinaryEncoder(f)
        self.writer.write(record, encoder)
        return f.getvalue()
def produce(self):
    writer = DatumWriter(self.schema)
    for i in range(100):
        # Use a fresh buffer per message so previously written records are not re-sent
        bytes_writer = io.BytesIO()
        encoder = avro.io.BinaryEncoder(bytes_writer)
        writer.write(
            {
                "name": self.names[random.randint(0, 9)],
                "favorite_color": self.colors[random.randint(0, 4)],
                "favorite_number": random.randint(0, 10)
            },
            encoder)
        raw_bytes = bytes_writer.getvalue()
        self.kafka_producer.send(topic=self.topic_name, value=raw_bytes)
def serialize_avro(payload_str, schema):
    """Serialize a JSON event to binary format based on an Avro schema.

    :param schema: avro schema of payload
    :param payload_str: event data in json string format
    :return: avro serialized binary data and corresponding schema
    """
    payload_json = json.loads(payload_str)
    writer = DatumWriter(schema)
    bytes_writer = io.BytesIO()
    encoder = BinaryEncoder(bytes_writer)
    writer.write(payload_json, encoder)
    raw_bytes = bytes_writer.getvalue()
    return raw_bytes, schema
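# A matching deserialization sketch (illustrative, not part of the original
# source): it decodes the raw bytes produced by serialize_avro() back into a JSON
# string, given the same schema object; it assumes the same io/json/avro.io
# imports as the function above.
def deserialize_avro(raw_bytes, schema):
    decoder = BinaryDecoder(io.BytesIO(raw_bytes))
    record = DatumReader(schema).read(decoder)
    return json.dumps(record)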
def __encode(self, data, schema):
    raw_bytes = None
    try:
        writer = DatumWriter(schema)
        bytes_writer = io.BytesIO()
        encoder = BinaryEncoder(bytes_writer)
        writer.write(data, encoder)
        raw_bytes = bytes_writer.getvalue()
    except Exception as e:
        print(f'Error encoding data: {repr(e)}')
    return raw_bytes
def publish_avro_records(project_id, topic_id, avsc_file):
    """Publish a BINARY or JSON encoded message to a topic configured with an Avro schema."""
    # [START pubsub_publish_avro_records]
    from avro.io import BinaryEncoder, DatumWriter
    import avro
    import io
    import json
    from google.api_core.exceptions import NotFound
    from google.cloud.pubsub import PublisherClient
    from google.pubsub_v1.types import Encoding

    # TODO(developer): Replace these variables before running the sample.
    # project_id = "your-project-id"
    # topic_id = "your-topic-id"
    # avsc_file = "path/to/an/avro/schema/file/(.avsc)/formatted/in/json"

    publisher_client = PublisherClient()
    topic_path = publisher_client.topic_path(project_id, topic_id)

    # Prepare to write Avro records to the binary output stream.
    avro_schema = avro.schema.parse(open(avsc_file, "rb").read())
    writer = DatumWriter(avro_schema)
    bout = io.BytesIO()

    # Prepare some data using a Python dictionary that matches the Avro schema
    record = {"name": "Alaska", "post_abbr": "AK"}

    try:
        # Get the topic encoding type.
        topic = publisher_client.get_topic(request={"topic": topic_path})
        encoding = topic.schema_settings.encoding

        # Encode the data according to the message serialization type.
        if encoding == Encoding.BINARY:
            encoder = BinaryEncoder(bout)
            writer.write(record, encoder)
            data = bout.getvalue()
            print(f"Preparing a binary-encoded message:\n{data}")
        elif encoding == Encoding.JSON:
            data = json.dumps(record).encode("utf-8")
            print(f"Preparing a JSON-encoded message:\n{data}")
        else:
            print(f"No encoding specified in {topic_path}. Abort.")
            exit(0)

        future = publisher_client.publish(topic_path, data)
        print(f"Published message ID: {future.result()}")

    except NotFound:
        print(f"{topic_id} not found.")
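# A minimal subscriber-side sketch (an assumption, not part of the sample above):
# decoding a BINARY-encoded Pub/Sub message payload back into a record, given the
# same .avsc schema file that was attached to the topic.
def decode_avro_message(data, avsc_file):
    import io
    import avro.schema
    from avro.io import BinaryDecoder, DatumReader

    avro_schema = avro.schema.parse(open(avsc_file, "rb").read())
    decoder = BinaryDecoder(io.BytesIO(data))
    return DatumReader(avro_schema).read(decoder)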
def __encode(self, data, schema=None):
    if schema is None:
        out_schema = self.out_schema
    else:
        out_schema = schema

    raw_bytes = None
    try:
        writer = DatumWriter(out_schema)
        bytes_writer = io.BytesIO()
        encoder = BinaryEncoder(bytes_writer)
        writer.write(data, encoder)
        raw_bytes = bytes_writer.getvalue()
    except Exception:
        print("Error encoding data", sys.exc_info())
    return raw_bytes
def avroSerialize(message):
    """Takes a message from the RTM stream, maps the fields we care about onto
    our Avro schema, and returns the Avro-serialized bytes."""
    # set up a writer to serialize data
    writer = DatumWriter(schema)
    # the RTM may return messages that do not fit our Avro schema,
    # so wrap the conversion in a try block
    try:
        # set up a new converted message
        new_message = {}
        new_message['user_id'] = message['user']
        new_message['record_type'] = message['type']
        new_message['text'] = message['text']
        new_message['channel'] = message['channel']
        new_message['time_stamp'] = message['ts']
        # DatumWriter.write() needs an encoder backed by a byte buffer;
        # serialize the message and return the raw bytes
        buf = io.BytesIO()
        writer.write(new_message, BinaryEncoder(buf))
        return buf.getvalue()
    # if we fail to write successfully, it's probably because we were handed a
    # message we don't care about, like a status change; in that case drop it
    except Exception:
        pass
#
# NB: the AvroOutputReader specific portion begins here
#

def new_column(name, value):
    column = dict()
    column['name'] = '%s' % name
    column['value'] = '%s' % value
    column['timestamp'] = int(time.time() * 1e6)  # long() is Python 2 only
    column['ttl'] = 0
    return column

# parse the current avro schema
proto = avro.protocol.parse(open('cassandra.avpr').read())
schema = proto.types_dict['StreamingMutation']

# open an avro encoder and writer for stdout
# (binary output must go to sys.stdout.buffer on Python 3)
enc = BinaryEncoder(sys.stdout.buffer)
writer = DatumWriter(schema)

# output a series of objects matching 'StreamingMutation' in the Avro interface
smutation = dict()
try:
    for word, count in word2count.items():
        smutation['key'] = word
        smutation['mutation'] = {'column_or_supercolumn': {'column': new_column('count', count)}}
        writer.write(smutation, enc)
finally:
    sys.stdout.buffer.flush()
producer = KafkaProducer(bootstrap_servers="localhost:9092",
                         compression_type="gzip")

# Kafka topic
topic = "tnx"

# Path to user.avsc avro schema
schema_path = "/home/cloudera/workspace/kafka-clients-python/transactions.avsc"
schema = avro.schema.Parse(open(schema_path).read())
print("Schema", schema.to_json())

writer = DatumWriter(schema)


def get_record():
    return {"id": "123",
            "merchant_id": "m123",
            "customer_id": "c345",
            "amount": 100.1,
            "category": "pos",
            "timestamp": int(time())}


for i in range(10):
    record = get_record()
    # Use a fresh buffer per message; reusing one BytesIO would resend all
    # previously written records with every send()
    bytes_writer = io.BytesIO()
    encoder = avro.io.BinaryEncoder(bytes_writer)
    writer.write(record, encoder)
    raw_bytes = bytes_writer.getvalue()
    producer.send(topic, raw_bytes)

producer.flush()
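# A consumer-side sketch for the producer above (an assumption, not from the
# original source): each Kafka message value holds exactly one Avro-encoded
# record; it reuses the `topic` and `schema` variables from the script above.
import io

from kafka import KafkaConsumer
from avro.io import BinaryDecoder, DatumReader

consumer = KafkaConsumer(topic,
                         bootstrap_servers="localhost:9092",
                         auto_offset_reset="earliest")
reader = DatumReader(schema)
for message in consumer:
    record = reader.read(BinaryDecoder(io.BytesIO(message.value)))
    print(record)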
"""Python avro official implementation encoding benchmark.""" from io import BytesIO from itertools import repeat from time import time from avro.datafile import DataFileReader, DataFileWriter from avro.io import DatumReader, DatumWriter, BinaryEncoder, BinaryDecoder import sys LOOPS = 1 with open(sys.argv[1]) as reader: datum_reader = DatumReader() file_reader = DataFileReader(reader, datum_reader) SCHEMA = datum_reader.writers_schema RECORDS = list(file_reader) buf = BytesIO() datum_writer = DatumWriter(SCHEMA) start = time() n = 0 for _ in repeat(None, LOOPS): for record in RECORDS: buf.seek(0) encoder = BinaryEncoder(buf) datum_writer.write(record, encoder) n += 1 print 1000. * (time() - start) / n
def serialize(data):
    writer = DatumWriter(schema)
    bytes_writer = io.BytesIO()
    encoder = avro.io.BinaryEncoder(bytes_writer)
    writer.write(data, encoder)
    return bytes_writer.getvalue()
def createAvroMemoryRecord(data, schema):
    f = io.BytesIO()  # BinaryEncoder emits bytes, so use BytesIO rather than StringIO
    encoder = BinaryEncoder(f)
    writer = DatumWriter(schema)
    writer.write(dict(data), encoder)
    return f.getvalue()
def toKey(self, x, schema):
    buf = io.BytesIO()  # avoid shadowing the builtin `bytes`
    writer = DatumWriter(schema)
    writer.write(x, BinaryEncoder(buf))
    buf.flush()
    return base64.b64encode(buf.getvalue())
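# A matching fromKey() sketch (illustrative, not part of the original class): it
# reverses the base64 + Avro binary encoding performed by toKey(), assuming
# BinaryDecoder and DatumReader are imported from avro.io.
def fromKey(self, key, schema):
    raw = base64.b64decode(key)
    decoder = BinaryDecoder(io.BytesIO(raw))
    return DatumReader(schema).read(decoder)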