def __init__(self, schema_json=None, key_json=None, value_json=None):
    """
    This class is initialized with a json string representation of a
    RADAR-base schema.

    Parameters
    ----------
    schema_json: string (json)
        A json string representation of a key-value pair RADAR-base schema
    key_json: string (json)
        A json string representation of a key RADAR-base schema
    value_json: string (json)
        A json string representation of a value RADAR-base schema

    Either schema_json or value_json must be specified. key_json may also
    be given alongside value_json.
    """
    if schema_json:
        self.schema = schema.Parse(schema_json)
    elif value_json:
        if key_json:
            self.schema = schema.Parse(
                combine_key_value_schemas(key_json, value_json))
        else:
            # Wrap the bare value schema in a stub record with a single
            # 'value' field so it mirrors a key-value schema.
            self.schema = self._FakeSchema()
            self.schema.fields = [self._FakeSchema()]
            self.schema.fields[0].type = schema.Parse(value_json)
            self.schema.fields[0].name = 'value'
    else:
        raise ValueError('Please provide a json representation of a '
                         'key-value schema or a value schema with or '
                         'without a separate key schema.')
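# A minimal usage sketch for the constructor above. The enclosing class name
# is not shown in this snippet, so 'RadarSchema' and the example value schema
# are assumptions for illustration only.
value_json = """
{"type": "record", "name": "Battery",
 "fields": [{"name": "level", "type": "float"}]}
"""
radar_schema = RadarSchema(value_json=value_json)  # value schema, no key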
def testOtherAttributes(self):
    correct = 0
    props = {}
    for example in OTHER_PROP_EXAMPLES:
        original_schema = schema.Parse(example.schema_string)
        round_trip_schema = schema.Parse(str(original_schema))
        self.assertEqual(original_schema.other_props,
                         round_trip_schema.other_props)
        if original_schema.type == "record":
            field_props = 0
            for f in original_schema.fields:
                if f.other_props:
                    props.update(f.other_props)
                    field_props += 1
            self.assertEqual(field_props, len(original_schema.fields))
        if original_schema.other_props:
            props.update(original_schema.other_props)
            correct += 1
    for k in props:
        v = props[k]
        if k == "cp_boolean":
            self.assertEqual(type(v), bool)
        elif k == "cp_int":
            self.assertEqual(type(v), int)
        elif k == "cp_object":
            self.assertEqual(type(v), dict)
        elif k == "cp_float":
            self.assertEqual(type(v), float)
        elif k == "cp_array":
            self.assertEqual(type(v), list)
    self.assertEqual(correct, len(OTHER_PROP_EXAMPLES))
def test_schema_versions(self):
    logger.info("Schemas")
    name = 'property'
    self.register = Registry(path_configs)

    r_schema_1 = self.register.get(name, version=1)
    r_schema_2 = self.register.get(name, version=2)
    r_schema_3 = self.register.get(name, version=3)
    r_schema_4 = self.register.get(name, version=4)
    r_schema_5 = self.register.get(name, version=5)

    _file_test_1 = schema.Parse(_avro_test_1)
    _file_test_2 = schema.Parse(_avro_test_2)
    _file_test_3 = schema.Parse(_avro_test_3)
    _file_test_4 = schema.Parse(_avro_test_4)
    _file_test_5 = schema.Parse(_avro_test_5)

    self.assertEqual(r_schema_1, _file_test_1)
    self.assertEqual(r_schema_2, _file_test_2)
    self.assertEqual(r_schema_3, _file_test_3)
    self.assertEqual(r_schema_4, _file_test_4)
    self.assertEqual(r_schema_5, _file_test_5)

    self.assertRaises(SchemaVersionNotFound,
                      lambda: self.register.get(name, version=6))
def testValidCastToStringAfterParse(self):
    """
    Test that the string generated by an Avro Schema object is, in fact,
    a valid Avro schema.
    """
    correct = 0
    for example in VALID_EXAMPLES:
        schema_data = schema.Parse(example.schema_string)
        schema.Parse(str(schema_data))
        correct += 1

    fail_msg = "Cast to string success on %d out of %d schemas" % \
        (correct, len(VALID_EXAMPLES))
    self.assertEqual(correct, len(VALID_EXAMPLES), fail_msg)
def testUnknownSymbol(self):
    writer_schema = schema.Parse("""\
        {"type": "enum", "name": "Test", "symbols": ["FOO", "BAR"]}""")
    datum_to_write = 'FOO'
    reader_schema = schema.Parse("""\
        {"type": "enum", "name": "Test", "symbols": ["BAR", "BAZ"]}""")

    writer, encoder, datum_writer = write_datum(datum_to_write, writer_schema)
    reader = io.BytesIO(writer.getvalue())
    decoder = avro_io.BinaryDecoder(reader)
    datum_reader = avro_io.DatumReader(writer_schema, reader_schema)
    self.assertRaises(
        avro_io.SchemaResolutionException, datum_reader.read, decoder)
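# Several tests in this section call a write_datum helper that is not shown
# here. A minimal sketch consistent with its call sites, assuming the test
# module's io/avro_io imports; the exact signature is an assumption.
def write_datum(datum, writer_schema):
    writer = io.BytesIO()
    encoder = avro_io.BinaryEncoder(writer)
    datum_writer = avro_io.DatumWriter(writer_schema)
    datum_writer.write(datum, encoder)
    return writer, encoder, datum_writer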
def check_skip_number(number_type):
    logging.debug('Testing skip number for %s', number_type)
    correct = 0
    for value_to_skip, hex_encoding in BINARY_ENCODINGS:
        VALUE_TO_READ = 6253
        logging.debug('Value to Skip: %d', value_to_skip)

        # write the value to skip and a known value
        writer_schema = schema.Parse('"%s"' % number_type.lower())
        writer, encoder, datum_writer = write_datum(value_to_skip,
                                                    writer_schema)
        datum_writer.write(VALUE_TO_READ, encoder)

        # skip the value
        reader = io.BytesIO(writer.getvalue())
        decoder = avro_io.BinaryDecoder(reader)
        decoder.skip_long()

        # read data from string buffer
        datum_reader = avro_io.DatumReader(writer_schema)
        read_value = datum_reader.read(decoder)

        logging.debug('Read Value: %d', read_value)
        if read_value == VALUE_TO_READ:
            correct += 1
    return correct
def testMetadata(self):
    file_path = self.NewTempFile()

    # Test the writer with a 'with' statement.
    with open(file_path, 'wb') as writer:
        datum_writer = io.DatumWriter()
        sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
        schema_object = schema.Parse(sample_schema)
        with datafile.DataFileWriter(writer, datum_writer,
                                     schema_object) as dfw:
            dfw.SetMeta('test.string', 'foo')
            dfw.SetMeta('test.number', '1')
            dfw.append(sample_datum)
    self.assertTrue(writer.closed)

    # Test the reader with a 'with' statement.
    datums = []
    with open(file_path, 'rb') as reader:
        datum_reader = io.DatumReader()
        with datafile.DataFileReader(reader, datum_reader) as dfr:
            self.assertEqual(b'foo', dfr.GetMeta('test.string'))
            self.assertEqual(b'1', dfr.GetMeta('test.number'))
            for datum in dfr:
                datums.append(datum)
    self.assertTrue(reader.closed)
def test_container(self):
    writer = open('data.avro', 'wb')
    datum_writer = io.DatumWriter()
    schema_object = schema.Parse("""\
        {
          "type": "record",
          "name": "StringPair",
          "doc": "A pair of strings.",
          "fields": [
            {"name": "left", "type": "string"},
            {"name": "right", "type": "string"}
          ]
        }
        """)
    dfw = datafile.DataFileWriter(writer, datum_writer, schema_object)
    datum = {'left': 'L', 'right': 'R'}
    dfw.append(datum)
    dfw.close()

    reader = open('data.avro', 'rb')
    datum_reader = io.DatumReader()
    dfr = datafile.DataFileReader(reader, datum_reader)
    data = []
    for datum in dfr:
        data.append(datum)
    dfr.close()

    self.assertEqual(1, len(data))
    self.assertEqual(datum, data[0])
def testSchemaPromotion(self):
    # Note that checking writer_schema.type in read_data allows us to
    # handle promotion correctly.
    promotable_schemas = ['"int"', '"long"', '"float"', '"double"']
    incorrect = 0
    for i, ws in enumerate(promotable_schemas):
        writer_schema = schema.Parse(ws)
        datum_to_write = 219
        for rs in promotable_schemas[i + 1:]:
            reader_schema = schema.Parse(rs)
            writer, enc, dw = write_datum(datum_to_write, writer_schema)
            datum_read = read_datum(writer, writer_schema, reader_schema)
            logging.debug('Writer: %s Reader: %s',
                          writer_schema, reader_schema)
            logging.debug('Datum Read: %s', datum_read)
            if datum_read != datum_to_write:
                incorrect += 1
    self.assertEqual(incorrect, 0)
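# testSchemaPromotion also relies on a read_datum helper that is not shown in
# this section. A minimal sketch matching its usage, assuming 'buffer' is the
# io.BytesIO returned by write_datum; the signature is an assumption.
def read_datum(buffer, writer_schema, reader_schema=None):
    reader = io.BytesIO(buffer.getvalue())
    decoder = avro_io.BinaryDecoder(reader)
    datum_reader = avro_io.DatumReader(writer_schema, reader_schema)
    return datum_reader.read(decoder)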
def __init__(self, reader, datum_reader):
    """Initializes a new data file reader.

    Args:
      reader: Open file to read from.
      datum_reader: Avro datum reader.
    """
    self._reader = reader
    self._raw_decoder = avro_io.BinaryDecoder(reader)
    self._datum_decoder = None  # Maybe reset at every block.
    self._datum_reader = datum_reader

    # read the header: magic, meta, sync
    self._read_header()

    # ensure codec is valid
    avro_codec_raw = self.GetMeta('avro.codec')
    if avro_codec_raw is None:
        self.codec = "null"
    else:
        self.codec = avro_codec_raw.decode('utf-8')
    if self.codec not in VALID_CODECS:
        raise DataFileException('Unknown codec: %s.' % self.codec)

    self._file_length = self._GetInputFileLength()

    # get ready to read
    self._block_count = 0
    self.datum_reader.writer_schema = schema.Parse(
        self.GetMeta(SCHEMA_KEY).decode('utf-8'))
def update(topic, schema_config, force=False):
    """Given a topic, update (or create) a schema."""
    client = CachedSchemaRegistryClient(schema_config)
    if topic == 'all':
        schema_files = Path(__file__).parent.glob('**/*.avsc')
    else:
        schema_files = Path(__file__).parent.glob(f'**/{topic}-*.avsc')

    for schema_file in schema_files:
        with open(schema_file) as f:
            schema_str = f.read()
        schema_dict = json.loads(schema_str)
        avro_schema = schema.Parse(schema_str)
        subject = (schema_dict['namespace'].replace('.', '-')
                   + '-' + schema_dict['name'])
        if force:
            client.update_compatibility('NONE', subject=subject)
        else:
            client.update_compatibility('BACKWARD', subject=subject)
        try:
            schema_id = client.register(subject, avro_schema)
            log.info(f'Added/updated {schema_file}\t Schema ID {schema_id}')
        except avro_error.ClientError as error:
            log.error(f'Error adding/updating {schema_file}: {error.message}')
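# Hypothetical invocation of update(); the registry URL is an assumption.
# 'all' registers every *.avsc found next to this module, while a topic name
# registers only that topic's schemas; force=True drops compatibility checks.
update('all', {'url': 'http://localhost:8081'})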
def testParse(self):
    correct = 0
    for iexample, example in enumerate(EXAMPLES):
        logging.debug('Testing example #%d\n%s',
                      iexample, example.schema_string)
        try:
            schema.Parse(example.schema_string)
            if example.valid:
                correct += 1
            else:
                self.fail('Invalid schema was parsed:\n%s'
                          % example.schema_string)
        except Exception as exn:
            if example.valid:
                self.fail('Valid schema failed to parse: %r\n%s'
                          % (example.schema_string, traceback.format_exc()))
            else:
                if logging.getLogger().getEffectiveLevel() <= 5:
                    logging.debug('Expected error:\n%s',
                                  traceback.format_exc())
                else:
                    logging.debug('Expected error: %r', exn)
                correct += 1

    self.assertEqual(
        correct,
        len(EXAMPLES),
        'Parse behavior correct on %d out of %d schemas.'
        % (correct, len(EXAMPLES)),
    )
def testValidateUnion(self):
    example_schema = """\
    ["int", "null"]
    """
    datum = None
    result = avro_io.Validate(schema.Parse(example_schema), datum)
    self.assertTrue(result)
def deserialize(x):
    schema_path = "data/files/fb_scheam.avsc"
    # Close the schema file after parsing instead of leaking the handle.
    with open(schema_path) as f:
        schema1 = schema.Parse(f.read())
    bytes_reader = io2.BytesIO(x)
    decoder = io.BinaryDecoder(bytes_reader)
    reader = io.DatumReader(schema1)
    message = reader.read(decoder)
    return message
def _write_items(base_name, schema_str, items):
    avro_schema = schema.Parse(schema_str)
    avro_file = base_name + '.avro'
    # Avro containers are binary, so open in 'wb' mode; the 'with' block
    # closes the writer, so no explicit close() call is needed.
    with DataFileWriter(open(avro_file, 'wb'), DatumWriter(),
                        avro_schema) as writer:
        for i in items:
            writer.append(i)
    return avro_file
def testTypeException(self):
    writer_schema = schema.Parse("""\
        {"type": "record", "name": "Test",
         "fields": [{"name": "F", "type": "int"},
                    {"name": "E", "type": "int"}]}""")
    datum_to_write = {'E': 5, 'F': 'Bad'}
    self.assertRaises(
        avro_io.AvroTypeException, write_datum, datum_to_write, writer_schema)
def testValidateUnionError(self):
    example_schema = """\
    ["int", "null"]
    """
    datum = "there should not be a string here"
    # Use a raw string so the regex escape \[ is not treated as an
    # invalid string escape sequence.
    expected_regex = r"datum should be one of following: \['int', 'null']"
    with self.assertRaisesRegex(avro_io.AvroTypeException, expected_regex):
        avro_io.Validate(schema.Parse(example_schema), datum)
def testValidate(self):
    passed = 0
    for example_schema, datum in SCHEMAS_TO_VALIDATE:
        logging.debug('Schema: %r', example_schema)
        logging.debug('Datum: %r', datum)
        validated = avro_io.Validate(schema.Parse(example_schema), datum)
        logging.debug('Valid: %s', validated)
        if validated:
            passed += 1
    self.assertEqual(passed, len(SCHEMAS_TO_VALIDATE))
def testDuplicateRecordField(self):
    schema_string = """{
      "type": "record",
      "name": "Test",
      "fields": [{"name": "foo", "type": "int"},
                 {"name": "foo", "type": "string"}]
    }"""
    with self.assertRaises(schema.SchemaParseException) as e:
        schema.Parse(schema_string)
    self.assertRegex(str(e.exception), 'Duplicate.*field name.*foo')
def generate(schema_file, output_path):
    # Close the schema file after parsing instead of leaking the handle.
    with open(schema_file, 'r') as f:
        interop_schema = schema.Parse(f.read())
    datum_writer = io.DatumWriter()
    for codec in datafile.VALID_CODECS:
        filename = 'py3'
        if codec != 'null':
            filename += '_' + codec
        with Path(output_path, filename).with_suffix('.avro').open('wb') as writer, \
                datafile.DataFileWriter(writer, datum_writer,
                                        interop_schema, codec) as dfw:
            dfw.append(DATUM)
def consumer2():
    consumer = KafkaConsumer('test')
    schema_path = "data/files/fb_scheam.avsc"
    with open(schema_path) as f:
        schema1 = schema.Parse(f.read())
    for msg in consumer:
        bytes_reader = io2.BytesIO(msg.value)
        decoder = io.BinaryDecoder(bytes_reader)
        reader = io.DatumReader(schema1)
        message = reader.read(decoder)
        # Return after decoding the first message.
        return message
def dict_to_json(data: Dict):
    # to JSON
    # avro_schema = schema.SchemaFromJSONData(schema_dict)
    # schema.Parse expects a json string, so read the file in text mode.
    with open("rate.avsc") as f:
        avro_schema = schema.Parse(f.read())
    serializer = AvroJsonSerializer(avro_schema)
    json_str = serializer.to_json(data)
    pretty_print(json_str)
def test_write_data(self):
    writer = open('pairs.avro', 'wb')
    datum_writer = io.DatumWriter()
    with open('Pair.avsc') as f:
        schema_object = schema.Parse(f.read())
    dfw = datafile.DataFileWriter(writer, datum_writer, schema_object)
    dfw.append({'left': 'a', 'right': '1'})
    dfw.append({'left': 'c', 'right': '2'})
    dfw.append({'left': 'b', 'right': '3'})
    dfw.append({'left': 'b', 'right': '2'})
    dfw.close()
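# A read-back counterpart to test_write_data, sketched under the assumption
# that pairs.avro was just written by the test above:
def test_read_data(self):
    with open('pairs.avro', 'rb') as reader:
        dfr = datafile.DataFileReader(reader, io.DatumReader())
        pairs = list(dfr)
    self.assertEqual(4, len(pairs))
    self.assertEqual({'left': 'a', 'right': '1'}, pairs[0])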
def testValidateShouldRaiseFormattedError(self):
    example_schema = '{"type": "int"}'
    datum = "aaa"
    expected_regex = ("datum should be int type,"
                      " but as value we got 'aaa'")
    with self.assertRaisesRegex(avro_io.AvroPrimitiveTypeException,
                                expected_regex):
        avro_io.Validate(schema.Parse(example_schema), datum)
def __init__(self, writer, datum_writer, writer_schema=None, codec='null'):
    """Constructs a new DataFileWriter instance.

    If the writer schema is not present, presume we're appending to an
    existing file.

    Args:
      writer: File-like object to write into.
      datum_writer: Avro datum writer used to serialize each datum.
      writer_schema: Schema of the data being written (None when appending).
      codec: Compression codec name; must be one of VALID_CODECS.
    """
    self._writer = writer
    self._encoder = avro_io.BinaryEncoder(writer)
    self._datum_writer = datum_writer
    self._buffer_writer = io.BytesIO()
    self._buffer_encoder = avro_io.BinaryEncoder(self._buffer_writer)
    self._block_count = 0
    self._meta = {}

    # Ensure we have a writer that accepts bytes:
    self._writer.write(b'')

    # Whether the header has already been written:
    self._header_written = False

    if writer_schema is not None:
        if codec not in VALID_CODECS:
            raise DataFileException('Unknown codec: %r' % codec)
        self._sync_marker = DataFileWriter.GenerateSyncMarker()
        self.SetMeta('avro.codec', codec)
        self.SetMeta('avro.schema', str(writer_schema).encode('utf-8'))
        self.datum_writer.writer_schema = writer_schema
    else:
        # Open the writer for reading to collect the existing metadata.
        dfr = DataFileReader(writer, avro_io.DatumReader())

        # TODO: collect arbitrary metadata
        # collect metadata
        self._sync_marker = dfr.sync_marker
        self.SetMeta('avro.codec', dfr.GetMeta('avro.codec'))

        # get schema used to write existing file
        schema_from_file = dfr.GetMeta('avro.schema').decode('utf-8')
        self.SetMeta('avro.schema', schema_from_file)
        self.datum_writer.writer_schema = schema.Parse(schema_from_file)

        # seek to the end of the file and prepare for writing
        writer.seek(0, 2)
        self._header_written = True
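# Usage sketch for the append path above: when writer_schema is None the
# writer reads the existing header (codec, schema, sync marker) and appends
# new blocks. The file name and datum are illustrative only.
with open('events.avro', 'ab+') as f:
    with DataFileWriter(f, avro_io.DatumWriter()) as dfw:
        dfw.append(new_datum)  # must match the schema already in the file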
def serialize(self, items):
    schema_path = "fb_scheam.avsc"
    with open(schema_path) as f:
        SCHEMA = schema.Parse(f.read())
    writer = io.DatumWriter(SCHEMA)
    bytes_writer = io2.BytesIO()
    encoder = io.BinaryEncoder(bytes_writer)
    # There must be a better way of writing this item that isn't so long
    writer.write(get_as_json(items), encoder)
    raw_bytes = bytes_writer.getvalue()
    return raw_bytes
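# Round-trip sketch pairing serialize() above with the deserialize() function
# earlier in this section; 'producer' and 'items' are hypothetical, and the
# datum returned by get_as_json must match fb_scheam.avsc.
raw_bytes = producer.serialize(items)
assert deserialize(raw_bytes) == get_as_json(items)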
def callback(ch, method, properties, body):
    # time.clock() was removed in Python 3.8; use perf_counter() instead.
    start_time = time.perf_counter()
    bytes_reader = BytesIO(body)
    decoder = avro_io.BinaryDecoder(bytes_reader)
    with open(f"schemas/{exchange}.avsc") as f:
        reader = avro_io.DatumReader(schema.Parse(f.read()))
    event_body = reader.read(decoder)
    time.sleep(0.1)  # Mock feature computing time
    print(f"Event received: "
          f"size: {sys.getsizeof(event_body)} bytes, "
          f"time: {time.perf_counter() - start_time} secs")
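# The (ch, method, properties, body) signature suggests a pika consumer. A
# minimal wiring sketch, assuming a local broker and a queue named after the
# 'exchange' variable used above; both names are assumptions.
import pika

connection = pika.BlockingConnection(pika.ConnectionParameters('localhost'))
channel = connection.channel()
channel.basic_consume(queue=exchange, on_message_callback=callback,
                      auto_ack=True)
channel.start_consuming()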
def testAppend(self):
    correct = 0
    codecs_to_validate = get_codecs_to_validate()
    for iexample, (writer_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE):
        for codec in codecs_to_validate:
            file_path = self.NewTempFile()

            logging.debug(
                'Performing append with codec %r in file %s for example #%d\n'
                'Writing datum: %r using writer schema:\n%s',
                codec, file_path, iexample, datum, writer_schema)

            logging.debug('Creating data file %r', file_path)
            with open(file_path, 'wb') as writer:
                datum_writer = io.DatumWriter()
                schema_object = schema.Parse(writer_schema)
                with datafile.DataFileWriter(
                        writer=writer,
                        datum_writer=datum_writer,
                        writer_schema=schema_object,
                        codec=codec,
                ) as dfw:
                    dfw.append(datum)

            logging.debug('Appending data to %r', file_path)
            for i in range(9):
                with open(file_path, 'ab+') as writer:
                    with datafile.DataFileWriter(writer,
                                                 io.DatumWriter()) as dfw:
                        dfw.append(datum)

            logging.debug('Reading appended data from %r', file_path)
            with open(file_path, 'rb') as reader:
                datum_reader = io.DatumReader()
                with datafile.DataFileReader(reader, datum_reader) as dfr:
                    appended_data = list(dfr)

            logging.debug('Appended data has %d items: %r',
                          len(appended_data), appended_data)

            if ([datum] * 10) == appended_data:
                correct += 1
            else:
                logging.error(
                    'Appended data does not match:\n'
                    'Expect: %r\n'
                    'Actual: %r',
                    [datum] * 10, appended_data)

    self.assertEqual(
        correct, len(codecs_to_validate) * len(SCHEMAS_TO_VALIDATE))
def testDocAttributes(self):
    correct = 0
    for example in DOC_EXAMPLES:
        original_schema = schema.Parse(example.schema_string)
        if original_schema.doc is not None:
            correct += 1
        if original_schema.type == 'record':
            for f in original_schema.fields:
                if f.doc is None:
                    self.fail("Failed to preserve 'doc' in fields: "
                              + example.schema_string)
    self.assertEqual(correct, len(DOC_EXAMPLES))
def testCorrectRecursiveExtraction(self):
    parsed = schema.Parse("""
    {
      "type": "record",
      "name": "X",
      "fields": [{
        "name": "y",
        "type": {
          "type": "record",
          "name": "Y",
          "fields": [{"name": "Z", "type": "X"},
                     {"name": "W", "type": "X"}]
        }
      }]
    }
    """)
    logging.debug('Parsed schema:\n%s', parsed)
    logging.debug('Fields: %s', parsed.fields)
    t = schema.Parse(str(parsed.fields[0].type))

    # If we've made it this far, the subschema was reasonably stringified;
    # it could be reparsed.
    self.assertEqual("X", t.fields[0].type.name)