def test_sanity():
    """
    Ensures that our "base" and "good" schemas are actually forwards- and
    backwards-compatible.

    Encodes one record with each schema, then decodes each buffer using the
    *other* schema as the reader schema; any incompatibility raises on read.
    """
    # fst schema / record
    # fix: use context managers so the schema files are closed deterministically
    with open("%s/MyRecord.base.avsc" % BASE_DIR) as schema_file:
        fst_schema = schema.parse(schema_file.read())
    fst_writer = DatumWriter(writers_schema=fst_schema)
    fst_record = {
        "fieldWithoutDefaultValue": 0,
        "properField": 0,
        "enumField": "A",
        "unionField": None,
        "arrayField": ["world"],
        "mapField": {"hello": "world"},
        "fixedField": "aaaaaaaaaaaaaaaa"
    }

    # sec schema / record
    with open("%s/MyRecord.good.avsc" % BASE_DIR) as schema_file:
        sec_schema = schema.parse(schema_file.read())
    sec_writer = DatumWriter(writers_schema=sec_schema)
    sec_record = {
        "fieldWithoutDefaultValue": 0,
        "properField2": 0,
        "enumField": "B",
        "unionField": None,
        "arrayField": ["world"],
        "fixedField": "bbbbbbbbbbbbbbbb"
    }

    # Encode record w/ fst
    fst_buf = StringIO.StringIO()
    fst_encoder = BinaryEncoder(fst_buf)
    fst_writer.write(fst_record, fst_encoder)
    fst_data = fst_buf.getvalue()

    # Encode record w/ sec
    sec_buf = StringIO.StringIO()
    sec_encoder = BinaryEncoder(sec_buf)
    sec_writer.write(sec_record, sec_encoder)
    sec_data = sec_buf.getvalue()

    # writers == fst, readers == sec
    sec_reader = DatumReader(writers_schema=fst_schema, readers_schema=sec_schema)
    sec_decoder = BinaryDecoder(StringIO.StringIO(fst_data))
    sec_from_fst = sec_reader.read(sec_decoder)  # no exception -> good

    # writers == sec, readers == fst
    fst_reader = DatumReader(writers_schema=sec_schema, readers_schema=fst_schema)
    fst_decoder = BinaryDecoder(StringIO.StringIO(sec_data))
    fst_from_sec = fst_reader.read(fst_decoder)  # no exception -> good
def respond(self, call_request):
    """Handle one framed RPC call and return the encoded response bytes.

    Performs the Avro handshake, resolves the remote message against the
    local protocol, invokes the handler, and serializes either the response
    or an AvroRemoteException back to the caller.
    """
    buffer_reader = io.BytesIO(call_request)
    buffer_decoder = BinaryDecoder(buffer_reader)
    buffer_writer = io.BytesIO()
    buffer_encoder = BinaryEncoder(buffer_writer)
    error = None
    response_metadata = {}
    try:
        # Handshake may already have written bytes into buffer_writer.
        remote_protocol = self.process_handshake(buffer_decoder, buffer_encoder)
        # Handshake was not fully completed (or no local protocol): reply
        # with whatever the handshake produced so far.
        if remote_protocol is None or self.local_protocol is None:
            return buffer_writer.getvalue()

        # Consume (and discard) the per-call request metadata map.
        DatumReader(schema.parse(
            '{"type": "map", "values": "bytes"}')).read(buffer_decoder)
        remote_message_name = buffer_decoder.read_utf8()

        # The message must exist in both the remote and the local protocol.
        remote_message = remote_protocol.messages.get(remote_message_name)
        if remote_message is None:
            fail_msg = 'Unknown remote message: %s' % remote_message_name
            raise schema.AvroException(fail_msg)
        local_message = self.local_protocol.messages.get(
            remote_message_name)
        if local_message is None:
            fail_msg = 'Unknown local message: %s' % remote_message_name
            raise schema.AvroException(fail_msg)

        # Read the request with schema resolution: remote writes, local reads.
        writers_schema = remote_message.request
        readers_schema = local_message.request
        request = self.read_request(writers_schema, readers_schema,
                                    buffer_decoder)

        response = None
        try:
            response = self.invoke(self.local_protocol, local_message,
                                   request)
        except AvroRemoteException as e:
            # Declared application-level error: forwarded as-is.
            error = e
        except Exception as e:
            # Undeclared error: wrapped so it can still be serialized.
            error = AvroRemoteException(str(e))

        # Response framing: metadata map, then an error flag, then either the
        # response datum or the error datum.
        DatumWriter(
            schema.parse('{"type": "map", "values": "bytes"}')).write(
                response_metadata, buffer_encoder)
        buffer_encoder.write_boolean(error is not None)
        if error is None:
            writers_schema = local_message.response
            self.write_response(writers_schema, response, buffer_encoder)
        else:
            writers_schema = local_message.errors
            self.write_error(writers_schema, error, buffer_encoder)
    except schema.AvroException as e:
        # Protocol-level failure: rebuild the response from scratch in a
        # fresh buffer (discarding anything written so far) as a generic
        # string error.
        error = AvroRemoteException(str(e))
        buffer_encoder = BinaryEncoder(io.BytesIO())
        DatumWriter(
            schema.parse('{"type": "map", "values": "bytes"}')).write(
                response_metadata, buffer_encoder)
        buffer_encoder.write_boolean(True)
        self.write_error(schema.parse('["string"]'), error, buffer_encoder)
        # NOTE(review): this returns only the fresh buffer's content, so any
        # handshake bytes already in buffer_writer are dropped here — confirm
        # this matches the framing the client expects.
        return buffer_encoder.writer.getvalue()
    return buffer_writer.getvalue()
def serialize(myschema, myobject):
    """Encode *myobject* with *myschema* and return the raw Avro bytes.

    :param myschema: parsed Avro schema used as the writer schema
    :param myobject: datum conforming to *myschema*
    :returns: the encoded bytes
    """
    buf = io.BytesIO()
    encoder = BinaryEncoder(buf)
    writer = DatumWriter(writer_schema=myschema)
    writer.write(myobject, encoder)
    # getvalue() returns the full buffer contents regardless of the stream
    # position, so the previous seek(0)/read() dance is unnecessary.
    return buf.getvalue()
def serialize(
    self,
    data,  # type: ObjectType
    schema,  # type: Union[str, bytes, avro.schema.Schema]
):
    # type: (ObjectType, Union[str, bytes, avro.schema.Schema]) -> bytes
    """Encode *data* to its Avro binary representation.

    Schema must be an Avro RecordSchema:
    https://avro.apache.org/docs/1.10.0/gettingstartedpython.html#Defining+a+schema

    :param data: An object to serialize
    :type data: ObjectType
    :param schema: An Avro RecordSchema
    :type schema: str
    :returns: Encoded bytes
    :rtype: bytes
    """
    if not schema:
        raise ValueError("Schema is required in Avro serializer.")

    writer = self.get_schema_writer(schema)
    with BytesIO() as stream:
        writer.write(data, BinaryEncoder(stream))
        payload = stream.getvalue()
    return payload
def serialize(thing):
    """Encode *thing* with the module-level SCHEMA and return the bytes."""
    out = StringIO()
    try:
        DatumWriter(SCHEMA).write(thing, BinaryEncoder(out))
        return out.getvalue()
    finally:
        out.close()
def _avro_serialize(msg):
    # NOTE(review): `self` comes from the enclosing scope — this is a
    # closure over the surrounding method, not a standalone function.
    sink = BytesIO()
    writer = DatumWriter(self.shared.topic['schema'])
    writer.write(msg, BinaryEncoder(sink))
    return sink.getvalue()
def send_avro_record_to_kafka(topic, value, bootstrap_servers, avro_schema_json):
    """Avro-encode *value* and produce it to *topic* via confluent-kafka."""
    parsed_schema = avro.schema.parse(avro_schema_json)

    producer = Producer({
        "bootstrap.servers": bootstrap_servers,
        "request.timeout.ms": "1000",
    })

    sink = io.BytesIO()
    DatumWriter(parsed_schema).write(value, BinaryEncoder(sink))

    try:
        producer.produce(topic=topic, value=sink.getvalue())
    except Exception as e:
        print(
            f"Exception while producing record value - {value} to topic - {topic}: {e}"
        )
    else:
        print(f"Successfully producing record value - {value} to topic - {topic}")

    producer.flush()
def main(args):
    """Read an Avro container file and publish each row, base64-encoded, to AMS.

    :param args: parsed CLI args providing avro_file and the AMS endpoint,
        project, topic and key.
    """
    log = logging.getLogger(__name__)
    log.setLevel(logging.INFO)
    sys_log = logging.handlers.SysLogHandler("/dev/log")
    sys_format = logging.Formatter(
        '%(name)s[%(process)d]: %(levelname)s %(message)s')
    sys_log.setFormatter(sys_format)
    log.addHandler(sys_log)

    # fix: Avro container files are binary — open with "rb", not "r" —
    # and close the reader when done via the context manager.
    with DataFileReader(open(args.avro_file, "rb"), DatumReader()) as reader:
        schema = reader.datum_reader.writers_schema
        # fix: the writer is schema-invariant, so build it once, not per row
        writer = DatumWriter(schema)
        for i, row in enumerate(reader):
            log.debug("Consumer row:" + str(row))
            bytes_writer = io.BytesIO()
            encoder = BinaryEncoder(bytes_writer)
            writer.write(row, encoder)
            raw_bytes = bytes_writer.getvalue()
            # fix: b64encode returns bytes on Python 3; decode so json.dumps
            # can serialize it (no-op change for the base64 alphabet).
            b64enc = base64.b64encode(raw_bytes).decode("ascii")
            msg = {"messages": [{"data": b64enc}]}
            json_str = json.dumps(msg)
            log.debug("json msg:" + json_str)
            publish(json_str, args.ams_endpoint, args.ams_project,
                    args.ams_topic, args.ams_key, log)
def toKey(self, x, avroType):
    """Avro-encode *x* (tagged per *avroType*) and return it base64-encoded.

    :param x: datum to encode; first normalized via jsonEncoder
    :param avroType: wrapper exposing the parsed Avro schema as .schema
    :returns: base64 representation of the binary encoding, usable as a key
    """
    x = jsonEncoder(avroType, x, False)
    # fix: renamed buffer — the original shadowed the builtin `bytes`;
    # also dropped the no-op BytesIO.flush() call.
    buf = io.BytesIO()
    writer = DatumWriter(avroType.schema)
    writer.write(x, BinaryEncoder(buf))
    return base64.b64encode(buf.getvalue())
def to_avro(self):
    """Serialize this record (via to_avro_dict) to Avro binary as a bytearray."""
    sink = BytesIO()
    writer = DatumWriter()
    writer.set_writer_schema(SYSLOG_AVRO_SCHEMA)
    writer.write(self.to_avro_dict(), BinaryEncoder(sink))
    return bytearray(sink.getbuffer().tobytes())
def avro_serialization(value_schema, x):
    """Encode *x* against *value_schema* using the low-level write_data API."""
    sink = BytesIO()
    try:
        writer = DatumWriter(writer_schema=value_schema)
        writer.write_data(value_schema, x, BinaryEncoder(sink))
        return sink.getvalue()
    finally:
        sink.close()
def __call__(self, state, scope, pos, paramTypes, x):
    """Avro-encode *x* (whose Avro type is paramTypes[0]) and return it as a string."""
    parsed = avro.schema.Parse(json.dumps(paramTypes[0]))
    datum = untagUnion(x, paramTypes[0])
    sink = io.BytesIO()
    DatumWriter(parsed).write(datum, BinaryEncoder(sink))
    sink.flush()
    return bytesToString(sink.getvalue())
def objToBin():
    """Serialize every record in the module-level `datum` iterable with schema `sc`.

    Records are written back to back into one binary buffer.
    """
    sink = io.BytesIO()
    enc = BinaryEncoder(sink)
    dw = DatumWriter(sc)
    for record in datum:
        dw.write(record, enc)
    return sink.getvalue()
def avro_view(request):
    """Return the current DataSource snapshot as an Avro-encoded HTTP response."""
    payload = DataSource().data
    sink = BytesIO()
    writer = DatumWriter(SchemaFromJSONData(avro_api_schema))
    writer.write(payload, BinaryEncoder(sink))
    return HttpResponse(sink.getvalue(), content_type='application/octet-stream')
def avro_encode(cls, json_data, schema=None):
    """Avro-serialize *json_data* to its binary form.

    :param json_data: mapping conforming to the writer schema
    :param schema: optional writer schema; defaults to cls.RESPONSE_SCHEMA
    :return: the encoded bytes
    """
    sink = BytesIO()
    writer = DatumWriter(writer_schema=schema or cls.RESPONSE_SCHEMA)
    writer.write(json_data, BinaryEncoder(sink))
    return sink.getvalue()
def encode(self, data):
    """Avro-encode *data* with self.schema.

    :param data: datum conforming to self.schema
    :returns: the encoded bytes, or None if encoding failed (error is printed)
    """
    raw_bytes = None
    try:
        writer = DatumWriter(self.schema)
        bytes_writer = io.BytesIO()
        encoder = BinaryEncoder(bytes_writer)
        writer.write(data, encoder)
        raw_bytes = bytes_writer.getvalue()
    # fix: narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
    # are no longer swallowed.
    except Exception:
        print("Error encoding data", sys.exc_info())
    return raw_bytes
def encode(self, schema_file, data):
    """Parse the Avro schema in *schema_file* and encode *data* with it.

    :param schema_file: path to a .avsc schema file
    :param data: datum conforming to the schema
    :returns: the encoded bytes, or None if encoding failed (error is printed)
    """
    raw_bytes = None
    try:
        # fix: close the schema file deterministically
        with open(schema_file) as f:
            schema = avro.schema.Parse(f.read())
        writer = DatumWriter(schema)
        bytes_writer = io.BytesIO()
        encoder = BinaryEncoder(bytes_writer)
        writer.write(data, encoder)
        raw_bytes = bytes_writer.getvalue()
    # fix: narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
    # are no longer swallowed.
    except Exception:
        print("Error encode data", sys.exc_info())
    return raw_bytes
def _avro_serialize(msg):
    # NOTE(review): `schema` comes from the enclosing scope — this is a
    # closure, not a standalone function.
    writer = DatumWriter(load_schema(schema))
    sink = BytesIO()
    enc = BinaryEncoder(sink)
    # A list is treated as a batch: each element is encoded back to back.
    if isinstance(msg, list):
        for item in msg:
            writer.write(item, enc)
    else:
        writer.write(msg, enc)
    return sink.getvalue()
def write_value(schema: TypedSchema, bio: io.BytesIO, value: dict):
    """Append *value* to *bio* using the serialization dictated by *schema*.

    Raises InvalidPayload for JSON-schema validation failures and ValueError
    for unsupported schema types.
    """
    if schema.schema_type is SchemaType.AVRO:
        DatumWriter(schema.schema).write(value, BinaryEncoder(bio))
        return
    if schema.schema_type is SchemaType.JSONSCHEMA:
        try:
            schema.schema.validate(value)
        except ValidationError as e:
            raise InvalidPayload from e
        bio.write(json_encode(value, binary=True))
        return
    raise ValueError("Unknown schema type")
def __serialize_as_needed(self, key, value):
    """Avro-encode the outgoing key and/or value when AVRO_OUTPUT is configured.

    Only reducers (or map-only jobs) emit final output, so other tasks pass
    the pair through untouched.
    """
    out_kv = {'K': key, 'V': value}
    jc = self.job_conf
    if AVRO_OUTPUT in jc and (self.is_reducer() or self.__is_map_only()):
        for mode, record in out_kv.iteritems():
            writer = self.__datum_writers.get(mode)
            if writer is None:
                # No writer registered for this side: leave it as-is.
                continue
            sink = StringIO()
            writer.write(record, BinaryEncoder(sink))
            out_kv[mode] = sink.getvalue()
    return out_kv['K'], out_kv['V']
def serialize(tweets):
    """Avro-encode a list of tweets as a binary Avro array.

    :param tweets: list of tweet records, or None
    :returns: the encoded bytes, or None when *tweets* is None
    """
    # Guard clause instead of wrapping the whole body in the conditional.
    if tweets is None:
        return None
    # fix: close the schema file deterministically
    with open(dir_path + "/tweet.schema.avsc", "rb") as schema_file:
        schema_tweet = avro.schema.Parse(schema_file.read())
    writer = DatumWriter()
    bytes_writer = BytesIO()
    encoder = BinaryEncoder(bytes_writer)
    writer.write_array(schema_tweet, tweets, encoder)
    return bytes_writer.getvalue()
def publish_avro_records(project_id, topic_id, avsc_file):
    """Publish a BINARY or JSON encoded message to a topic configured with an Avro schema."""
    # [START pubsub_publish_avro_records]
    from avro.io import BinaryEncoder, DatumWriter
    import avro
    import io
    import json
    from google.api_core.exceptions import NotFound
    from google.cloud.pubsub import PublisherClient
    from google.pubsub_v1.types import Encoding

    # TODO(developer): Replace these variables before running the sample.
    # project_id = "your-project-id"
    # topic_id = "your-topic-id"
    # avsc_file = "path/to/an/avro/schema/file/(.avsc)/formatted/in/json"

    publisher_client = PublisherClient()
    topic_path = publisher_client.topic_path(project_id, topic_id)

    # Prepare to write Avro records to the binary output stream.
    # fix: close the schema file deterministically instead of leaking the handle
    with open(avsc_file, "rb") as schema_file:
        avro_schema = avro.schema.parse(schema_file.read())
    writer = DatumWriter(avro_schema)
    bout = io.BytesIO()

    # Prepare some data using a Python dictionary that matches the Avro schema
    record = {"name": "Alaska", "post_abbr": "AK"}

    try:
        # Get the topic encoding type.
        topic = publisher_client.get_topic(request={"topic": topic_path})
        encoding = topic.schema_settings.encoding

        # Encode the data according to the message serialization type.
        if encoding == Encoding.BINARY:
            encoder = BinaryEncoder(bout)
            writer.write(record, encoder)
            data = bout.getvalue()
            print(f"Preparing a binary-encoded message:\n{data}")
        elif encoding == Encoding.JSON:
            data = json.dumps(record).encode("utf-8")
            print(f"Preparing a JSON-encoded message:\n{data}")
        else:
            print(f"No encoding specified in {topic_path}. Abort.")
            exit(0)

        future = publisher_client.publish(topic_path, data)
        print(f"Published message ID: {future.result()}")
    except NotFound:
        print(f"{topic_id} not found.")
def serialize_avro(payload_str, schema):
    """Serialize a JSON event string to Avro binary using *schema*.

    :param payload_str: event data as a JSON string
    :param schema: parsed Avro schema of the payload
    :return: tuple of (encoded bytes, the schema that produced them)
    """
    record = json.loads(payload_str)
    sink = io.BytesIO()
    DatumWriter(schema).write(record, BinaryEncoder(sink))
    return sink.getvalue(), schema
def __encode(self, data, schema):
    """Avro-encode *data* with *schema*; returns None (after printing) on failure."""
    encoded = None
    try:
        sink = io.BytesIO()
        DatumWriter(schema).write(data, BinaryEncoder(sink))
        encoded = sink.getvalue()
    except Exception as e:
        print(f'Error encoding data: {repr(e)}')
    return encoded
def __init__(self, boostrap_servers, topic, deployment_id,
             data_scheme_filename, label_scheme_filename, description='',
             validation_rate=0, control_topic='control', group_id='sink'):
    """Initialize an AVRO sink: loads data/label schemas and prepares
    per-stream writers and encoders.

    :param data_scheme_filename: path to the Avro schema for data records
    :param label_scheme_filename: path to the Avro schema for label records
    (remaining parameters are forwarded to the base class)
    """
    input_format = 'AVRO'
    super().__init__(boostrap_servers, topic, deployment_id, input_format,
                     description, validation_rate, control_topic, group_id)
    self.data_scheme_filename = data_scheme_filename
    # fix: close the schema files deterministically instead of leaking handles
    with open(self.data_scheme_filename, "r") as f:
        self.data_schema = f.read()
    self.avro_data_schema = avro.schema.Parse(self.data_schema)
    self.data_writer = DatumWriter(self.avro_data_schema)
    self.label_scheme_filename = label_scheme_filename
    with open(self.label_scheme_filename, "r") as f:
        self.label_schema = f.read()
    self.avro_label_schema = avro.schema.Parse(self.label_schema)
    self.label_writer = DatumWriter(self.avro_label_schema)
    # Reusable in-memory buffers/encoders, one per stream.
    self.data_io = io.BytesIO()
    self.label_io = io.BytesIO()
    self.data_encoder = BinaryEncoder(self.data_io)
    self.label_encoder = BinaryEncoder(self.label_io)
    self.input_config = {
        'data_scheme': self.data_schema,
        'label_scheme': self.label_schema,
    }
def __encode(self, data, schema=None):
    """Avro-encode *data*.

    :param data: datum conforming to the chosen schema
    :param schema: optional writer schema; defaults to self.out_schema
    :returns: the encoded bytes, or None if encoding failed (error is printed)
    """
    out_schema = self.out_schema if schema is None else schema
    raw_bytes = None
    try:
        writer = DatumWriter(out_schema)
        bytes_writer = io.BytesIO()
        encoder = BinaryEncoder(bytes_writer)
        writer.write(data, encoder)
        raw_bytes = bytes_writer.getvalue()
    # fix: narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
    # are no longer swallowed.
    except Exception:
        print("Error encoding data", sys.exc_info())
    return raw_bytes
def encode_avro(msg, writer):
    """Serialize *msg* to Avro binary using a prepared writer.

    Args:
        msg ([dict]): record matching the writer's schema
        writer ([DatumWriter]): writer pre-bound to the Avro schema

    Returns:
        [bytes]: the encoded record
    """
    sink = io.BytesIO()
    writer.write(msg, BinaryEncoder(sink))
    return sink.getvalue()
def produce(self, msg):
    """Serialize *msg* per the configured serialization type and produce it
    to Kafka via self.kfkprod.
    """
    if self.ser_type == kfkcfg.SERIALIZATIO_JSON:
        # s = json.dumps(msg)
        # json_util.default handles types the stock encoder cannot
        # (e.g. BSON/datetime values).
        s = json.dumps(msg, default=json_util.default)
        future = self.kfkprod.produce(bytes(s, 'utf-8'))
        # msg = json.dumps(msg, default=json_util.default).encode('utf-8')
        # future = self.kfkprod.produce(bytes(msg))
    elif self.ser_type == kfkcfg.SERIALIZATIO_AVRO:
        # Avro path: binary-encode msg with the configured schema.
        writer = DatumWriter(self.avro_schema)
        bytes_writer = io.BytesIO()
        encoder = BinaryEncoder(bytes_writer)
        writer.write(msg, encoder)
        raw_bytes = bytes_writer.getvalue()
        # NOTE(review): `future` is assigned but never awaited/returned here —
        # delivery is apparently fire-and-forget; confirm that is intended.
        future = self.kfkprod.produce(raw_bytes)
def __init__(self, boostrap_servers, topic, data_scheme_filename,
             group_id='sink'):
    """Initialize a simple Kafka sink with a single Avro data schema.

    :param boostrap_servers: Kafka bootstrap servers
    :param topic: target topic name
    :param data_scheme_filename: path to the Avro schema file for data records
    :param group_id: consumer group id (kept for interface compatibility)
    """
    self.boostrap_servers = boostrap_servers
    self.topic = topic
    self.data_scheme_filename = data_scheme_filename
    # fix: close the schema file deterministically instead of leaking the handle
    with open(self.data_scheme_filename, "r") as f:
        self.data_schema = f.read()
    self.avro_data_schema = avro.schema.Parse(self.data_schema)
    self.data_writer = DatumWriter(self.avro_data_schema)
    # Reusable in-memory buffer/encoder for outgoing records.
    self.data_io = io.BytesIO()
    self.data_encoder = BinaryEncoder(self.data_io)
    self.__producer = KafkaProducer(
        bootstrap_servers=self.boostrap_servers)
def send_avro_record_to_kafka(topic, value, bootstrap_servers,
                              avro_schema_json):
    """Avro-encode *value* and send it to *topic* via kafka-python."""
    parsed_schema = avro.schema.parse(avro_schema_json)
    producer = KafkaProducer(bootstrap_servers=bootstrap_servers)

    sink = io.BytesIO()
    DatumWriter(parsed_schema).write(value, BinaryEncoder(sink))

    try:
        producer.send(topic=topic, value=sink.getvalue())
    except Exception as e:
        print(
            f"Exception while producing record value - {value} to topic - {topic}: {e}"
        )
    else:
        print(
            f"Successfully producing record value - {value} to topic - {topic}"
        )
    producer.flush()