def _get_decoder_func(self, schema_id, payload, is_key=False):
    """Return the decoder callable for ``schema_id``, building it on first use.

    The callable is cached in ``self.id_to_decoder_func`` so later calls for
    the same schema id return immediately.

    :param schema_id: id of the writer schema to look up in the registry.
    :param payload: seekable stream positioned at the start of the Avro
        body; it is used for a probe read and rewound afterwards.
    :param is_key: when true, ``self.reader_key_schema`` is used as the
        reader schema, otherwise ``self.reader_value_schema``.
    :returns: a callable taking a stream and returning the decoded datum.
    :raises SerializerError: if the writer schema cannot be fetched.
    """
    if schema_id in self.id_to_decoder_func:
        return self.id_to_decoder_func[schema_id]

    # fetch writer schema from schema reg
    try:
        writer_schema_obj = self.registry_client.get_by_id(schema_id)
    except ClientError as e:
        raise SerializerError("unable to fetch schema with id %d: %s" % (schema_id, str(e)))

    if writer_schema_obj is None:
        raise SerializerError("unable to fetch schema with id %d" % (schema_id))

    curr_pos = payload.tell()

    reader_schema_obj = self.reader_key_schema if is_key else self.reader_value_schema

    if HAS_FAST:
        # try to use fast avro
        try:
            fast_avro_writer_schema = parse_schema(writer_schema_obj.to_json())
            # A missing reader schema must not disqualify fastavro: calling
            # ``.to_json()`` on None would raise and be swallowed below,
            # silently forcing the slow path. Hand None through instead.
            if reader_schema_obj is not None:
                fast_avro_reader_schema = parse_schema(reader_schema_obj.to_json())
            else:
                fast_avro_reader_schema = None
            schemaless_reader(payload, fast_avro_writer_schema)

            # If we reach this point, this means we have fastavro and it can
            # do this deserialization. Rewind since this method just determines
            # the reader function and we need to deserialize again along the
            # normal path.
            payload.seek(curr_pos)

            self.id_to_decoder_func[schema_id] = lambda p: schemaless_reader(
                p, fast_avro_writer_schema, fast_avro_reader_schema)
            return self.id_to_decoder_func[schema_id]
        except Exception:
            # Fast avro failed, fall thru to standard avro below.
            pass

    # here means we should just delegate to slow avro
    # rewind
    payload.seek(curr_pos)
    # Avro DatumReader py2/py3 inconsistency, hence no param keywords
    # should be revisited later
    # https://github.com/apache/avro/blob/master/lang/py3/avro/io.py#L459
    # https://github.com/apache/avro/blob/master/lang/py/src/avro/io.py#L423
    # def __init__(self, writers_schema=None, readers_schema=None)
    # def __init__(self, writer_schema=None, reader_schema=None)
    avro_reader = avro.io.DatumReader(writer_schema_obj, reader_schema_obj)

    def decoder(p):
        bin_decoder = avro.io.BinaryDecoder(p)
        return avro_reader.read(bin_decoder)

    self.id_to_decoder_func[schema_id] = decoder
    return self.id_to_decoder_func[schema_id]
def test_unknown_type():
    """parse_schema raises UnknownType for a type name it does not know."""
    bogus_schema = {"type": "unknown"}

    with pytest.raises(UnknownType):
        parse_schema(bogus_schema)
def test_union_schemas_must_have_names_in_order():
    """https://github.com/fastavro/fastavro/issues/450

    A named type in a top-level union may only be referenced by members
    that come after its definition.
    """

    def location():
        # Fresh dict per call so each union gets its own objects.
        return {
            "name": "Location",
            "type": "record",
            "fields": [{"name": "city", "type": "long"}],
        }

    def weather():
        return {
            "name": "Weather",
            "type": "record",
            "fields": [{"name": "of", "type": "Location"}],
        }

    # This should work because Location is defined first
    defined_before_use = [location(), weather()]
    parse_schema(defined_before_use)

    # This should not work because Location is defined after it is used
    used_before_defined = [weather(), location()]
    with pytest.raises(UnknownType):
        parse_schema(used_before_defined)
def test_decimal_fixed_accommodates_precision():
    """https://github.com/fastavro/fastavro/issues/457

    A fixed-backed decimal of size 2 declaring precision 10 must be
    rejected at parse time.
    """
    too_small_fixed = {
        "name": "fixed_decimal",
        "logicalType": "decimal",
        "precision": 10,
        "scale": 2,
        "type": "fixed",
        "size": 2,
    }
    schema = {
        "type": "record",
        "name": "test_scale_is_an_int",
        "fields": [{"name": "field", "type": too_small_fixed}],
    }
    expected_message = r"decimal precision of \d+ doesn't fit into array of length \d+"

    with pytest.raises(SchemaParseException, match=expected_message):
        parse_schema(schema)
def test_aliases_is_a_list():
    """https://github.com/fastavro/fastavro/issues/206

    A field whose ``aliases`` is a plain string must fail schema parsing.
    """
    field_with_string_aliases = {
        "name": "field",
        "type": "string",
        "aliases": "foobar",
    }
    schema = {
        "type": "record",
        "name": "test_parse_schema",
        "fields": [field_with_string_aliases],
    }

    with pytest.raises(SchemaParseException):
        parse_schema(schema)
def test_parse_schema():
    """Parsing marks the schema as parsed, and re-parsing changes nothing."""
    raw_schema = {
        "type": "record",
        "name": "test_parse_schema",
        "fields": [{"name": "field", "type": "string"}],
    }

    first_pass = parse_schema(raw_schema)
    assert "__fastavro_parsed" in first_pass

    second_pass = parse_schema(first_pass)
    assert second_pass == first_pass
def test_parse_schema_includes_hint_with_list():
    """https://github.com/fastavro/fastavro/issues/444

    Every member of a parsed top-level list carries the parsed hint.
    """
    first_record = {
        "type": "record",
        "name": "test_parse_schema_includes_hint_with_list_1",
        "doc": "blah",
        "fields": [{"name": "field1", "type": "string", "default": ""}],
    }
    second_record = {
        "type": "record",
        "name": "test_parse_schema_includes_hint_with_list_2",
        "doc": "blah",
        "fields": [{"name": "field2", "type": "string", "default": ""}],
    }

    parsed_members = parse_schema([first_record, second_record])

    assert all("__fastavro_parsed" in member for member in parsed_members)
def test_enum_named_type():
    """https://github.com/fastavro/fastavro/issues/450

    An enum defined inline in one field can be referenced by name in a
    later field, and records survive a write/read roundtrip.
    """
    enum_definition = {
        "type": "enum",
        "name": "my_enum",
        "symbols": ["FOO", "BAR"],
    }
    schema = {
        "type": "record",
        "name": "test_enum_named_type",
        "fields": [
            {"name": "test1", "type": enum_definition},
            {"name": "test2", "type": "my_enum"},
        ],
    }
    records = [{"test1": "FOO", "test2": "BAR"}]

    roundtripped = roundtrip(parse_schema(schema), records)

    assert records == roundtripped
def test_fixed_named_type():
    """https://github.com/fastavro/fastavro/issues/450

    A fixed type defined inline in one field can be referenced by name in
    a later field, and records survive a write/read roundtrip.
    """
    fixed_definition = {
        "type": "fixed",
        "name": "my_fixed",
        "size": 4,
    }
    schema = {
        "type": "record",
        "name": "test_fixed_named_type",
        "fields": [
            {"name": "test1", "type": fixed_definition},
            {"name": "test2", "type": "my_fixed"},
        ],
    }
    records = [{"test1": b"1234", "test2": b"4321"}]

    roundtripped = roundtrip(parse_schema(schema), records)

    assert records == roundtripped
def test_record_named_type():
    """https://github.com/fastavro/fastavro/issues/450

    A record defined inline in one field can be referenced by name in a
    later field, and records survive a write/read roundtrip.
    """
    record_definition = {
        "type": "record",
        "name": "my_record",
        "fields": [{"name": "field1", "type": "string"}],
    }
    schema = {
        "type": "record",
        "name": "test_record_named_type",
        "fields": [
            {"name": "test1", "type": record_definition},
            {"name": "test2", "type": "my_record"},
        ],
    }
    records = [{"test1": {"field1": "foo"}, "test2": {"field1": "bar"}}]

    roundtripped = roundtrip(parse_schema(schema), records)

    assert records == roundtripped
def _get_encoder_func(self, writer_schema):
    """Build a ``(record, fp)`` encoder for ``writer_schema``.

    Uses fastavro's ``schemaless_writer`` when ``HAS_FAST`` is set,
    otherwise an ``avro.io.DatumWriter`` wrapped in a ``BinaryEncoder``.
    """
    if not HAS_FAST:
        datum_writer = avro.io.DatumWriter(writer_schema)
        return lambda record, fp: datum_writer.write(record, avro.io.BinaryEncoder(fp))

    fast_schema = parse_schema(writer_schema.to_json())
    return lambda record, fp: schemaless_writer(fp, fast_schema, record)
def write_read(in_data):
    """Write ``in_data`` to an in-memory buffer and return what is read back.

    The records are written with the parsed ``rec_avro_schema()`` schema
    and the result is the list of records produced by ``reader``.
    """
    parsed = schema.parse_schema(rec_avro_schema())

    stream = io.BytesIO()
    writer(stream, parsed, in_data)

    # Rewind so the reader starts at the first written byte.
    stream.seek(0)
    return list(reader(stream))
def test_with_dependent_schema():
    """Tests a schema with dependent schema
    https://github.com/fastavro/fastavro/issues/418

    The dependency and the schema referencing it are parsed together as a
    list, then a record is roundtripped through the result.
    """
    name_field = {"name": "_name", "type": "string"}

    dependency = {
        "type": "record",
        "name": "Dependency",
        "namespace": "test",
        "fields": [dict(name_field)],
    }
    schema = {
        "type": "record",
        "name": "Test",
        "namespace": "test",
        "fields": [
            dict(name_field),
            {"name": "_dependency", "type": "Dependency"},
        ],
    }
    records = [{"_name": "parent", "_dependency": {"_name": "child"}}]

    combined = parse_schema([dependency, schema])

    assert records == roundtrip(combined, records)
def test_doc_left_in_parse_schema():
    """With the write hint disabled, parsing returns an equal schema, doc included."""
    schema = {
        "type": "record",
        "name": "test_doc_left_in_parse_schema",
        "doc": "blah",
        "fields": [{"name": "field1", "type": "string", "default": ""}],
    }

    parsed = parse_schema(schema, _write_hint=False)

    assert schema == parsed
def _get_encoder_func(self, writer_schema: str) -> callable:
    """Build a ``(record, outf)`` encoder from a JSON schema string.

    Uses fastavro's ``schemaless_writer`` when ``HAS_FAST`` is set,
    otherwise an ``avro.io.DatumWriter`` wrapped in a ``BinaryEncoder``.
    """
    if not HAS_FAST:
        datum_writer = avro.io.DatumWriter(avro.schema.parse(writer_schema))
        return lambda record, outf: datum_writer.write(record, avro.io.BinaryEncoder(outf))

    fast_schema = parse_schema(json.loads(writer_schema))
    return lambda record, outf: schemaless_writer(outf, fast_schema, record)
def test_scale_is_an_int():
    """https://github.com/fastavro/fastavro/issues/262

    A decimal whose ``scale`` is a string must fail schema parsing.
    """
    decimal_with_string_scale = {
        "logicalType": "decimal",
        "precision": 5,
        "scale": "2",
        "type": "bytes",
    }
    schema = {
        "type": "record",
        "name": "test_scale_is_an_int",
        "fields": [{"name": "field", "type": decimal_with_string_scale}],
    }
    expected_message = "decimal scale must be a postive integer"

    with pytest.raises(SchemaParseException, match=expected_message):
        parse_schema(schema)
def test_enum_symbols_validation__uniqueness():
    """https://github.com/fastavro/fastavro/issues/551

    An enum listing the same symbol twice must fail schema parsing.
    """
    enum_with_duplicate = {
        "name": "my_enum",
        "type": "enum",
        "symbols": ["FOO", "BAR", "FOO"],
    }
    invalid_schema = {
        "type": "record",
        "name": "my_schema",
        "fields": [{"name": "enum_field", "type": enum_with_duplicate}],
    }

    with pytest.raises(SchemaParseException) as err:
        parse_schema(invalid_schema)

    message = str(err.value)
    assert message == "All symbols in an enum must be unique"
def test_using_named_schemas_to_handle_references():
    """A shared ``named_schemas`` dict lets one parse reference types from another."""
    location = {
        "name": "Location",
        "type": "record",
        "fields": [{"name": "city", "type": "long"}],
    }
    weather = {
        "name": "Weather",
        "type": "record",
        "fields": [{"name": "of", "type": "Location"}],
    }

    # Parsing both against the same dict is expected to succeed.
    shared_names = {}
    for definition in (location, weather):
        parse_schema(definition, shared_names)

    # This should not work because didn't supply the named schemas
    with pytest.raises(UnknownType):
        parse_schema(weather)
def test_explicit_null_namespace_2():
    """https://github.com/fastavro/fastavro/issues/537

    An explicit ``None`` namespace leaves the parsed name unqualified.
    """
    schema = {
        "type": "record",
        "name": "my_schema",
        "namespace": None,
        "fields": [{"name": "subfield", "type": "string"}],
    }

    parsed_name = parse_schema(schema)["name"]

    assert parsed_name == "my_schema"
def test_aliases_are_preserved():
    """A field's ``aliases`` entry is still present after parsing."""
    aliased_field = {
        "name": "field",
        "type": "string",
        "aliases": ["test"],
    }
    schema = {
        "type": "record",
        "name": "test_parse_schema",
        "fields": [aliased_field],
    }

    first_parsed_field = parse_schema(schema)["fields"][0]

    assert "aliases" in first_parsed_field
def test_enum_symbols_validation__correct(symbol):
    """https://github.com/fastavro/fastavro/issues/551

    Parsing an enum whose only symbol is ``symbol`` must not raise
    SchemaParseException.
    """
    single_symbol_enum = {
        "name": "my_enum",
        "type": "enum",
        "symbols": [symbol],
    }
    schema_under_test = {
        "type": "record",
        "name": "my_schema",
        "fields": [{"name": "enum_field", "type": single_symbol_enum}],
    }

    try:
        parse_schema(schema_under_test)
    except SchemaParseException:
        pytest.fail(
            f"valid symbol {symbol} has been incorrectly marked as invalid.")
def test_precision_is_an_int():
    """https://github.com/fastavro/fastavro/issues/262

    A decimal whose ``precision`` is a string must fail schema parsing.
    """
    schema = {
        "type": "record",
        "name": "test_scale_is_an_int",
        "fields": [{
            "name": "field",
            "type": {
                "logicalType": "decimal",
                "precision": "5",
                "scale": 2,
                "type": "bytes",
            },
        }],
    }

    with pytest.raises(SchemaParseException) as exc:
        parse_schema(schema)

    # Inspect the raised exception itself: ``exc`` is pytest's ExceptionInfo
    # wrapper, and its own string form is not the exception message.
    assert "decimal precision must be a postive integer" in str(exc.value)
def test_writer_open_and_close(self):
    """Open a writer on an Avro sink and close it again.

    The temporary destination file is removed even when the sink raises,
    so a failing run does not leave the file behind.
    """
    # Create and then close a temp file so we can manually open it later
    dst = tempfile.NamedTemporaryFile(delete=False)
    dst.close()

    try:
        schema = parse_schema(json.loads(self.SCHEMA_STRING))
        sink = _create_avro_sink(
            'some_avro_sink', schema, 'null', '.end', 0, None,
            'application/x-avro')

        w = sink.open(dst.name)
        sink.close(w)
    finally:
        # delete=False means nothing else will clean this file up.
        os.unlink(dst.name)
def test_schema_expansion_3():
    """https://github.com/fastavro/fastavro/issues/538

    A type supplied only through ``named_schemas`` must be inlined by
    ``expand_schema`` rather than raising UnknownType.
    """

    def dependency_record():
        # Fresh dict per call so input and expectation share no objects.
        return {
            "name": "Dependency",
            "namespace": "com.namespace.dependencies",
            "type": "record",
            "fields": [{"name": "sub_field_1", "type": "string"}],
        }

    references = {"com.namespace.dependencies.Dependency": dependency_record()}

    original_schema = {
        "name": "MasterSchema",
        "namespace": "com.namespace.master",
        "type": "record",
        "fields": [
            {"name": "field_2", "type": "com.namespace.dependencies.Dependency"},
        ],
    }

    expected_expanded_schema_fields = [
        {"name": "field_2", "type": dependency_record()},
    ]

    assert isinstance(original_schema, dict)

    try:
        parsed_schema = parse_schema(original_schema, named_schemas=references)
        expanded_fields = expand_schema(parsed_schema)["fields"]
        assert expected_expanded_schema_fields == expanded_fields
    except UnknownType:
        pytest.fail(
            "expand_schema raised UnknownType even though referenced type is part of named_schemas"
        )
def test_decimal_precision_is_greater_than_scale():
    """https://github.com/fastavro/fastavro/issues/457

    A decimal with scale 10 and precision 5 must fail schema parsing.
    """
    scale_exceeds_precision = {
        "logicalType": "decimal",
        "precision": 5,
        "scale": 10,
        "type": "bytes",
    }
    schema = {
        "type": "record",
        "name": "test_scale_is_an_int",
        "fields": [{"name": "field", "type": scale_exceeds_precision}],
    }
    expected_message = "decimal scale must be less than or equal to"

    with pytest.raises(SchemaParseException, match=expected_message):
        parse_schema(schema)
def test_enum_symbols_validation__invalid(symbol):
    """https://github.com/fastavro/fastavro/issues/551

    Parsing an enum whose only symbol is ``symbol`` must raise
    SchemaParseException with the symbol-pattern message.
    """
    single_symbol_enum = {
        "name": "my_enum",
        "type": "enum",
        "symbols": [symbol],
    }
    invalid_schema = {
        "type": "record",
        "name": "my_schema",
        "fields": [{"name": "enum_field", "type": single_symbol_enum}],
    }
    expected_message = (
        "Every symbol must match the regular expression [A-Za-z_][A-Za-z0-9_]*"
    )

    with pytest.raises(SchemaParseException) as err:
        parse_schema(invalid_schema)

    assert str(err.value) == expected_message
def test_with_dependent_schema():
    """Tests a schema with dependent schema
    https://github.com/fastavro/fastavro/issues/418

    The dependency and the referencing schema are parsed one after the
    other, then a record is roundtripped through the unparsed schema.
    """
    name_field = {"name": "_name", "type": "string"}

    dependency = {
        "type": "record",
        "name": "Dependency",
        "namespace": "test",
        "fields": [dict(name_field)],
    }
    schema = {
        "type": "record",
        "name": "Test",
        "namespace": "test",
        "fields": [
            dict(name_field),
            {"name": "_dependency", "type": "Dependency"},
        ],
    }
    records = [{'_name': 'parent', '_dependency': {'_name': 'child'}}]

    # NOTE(review): both results are discarded, so this presumably relies on
    # parse_schema registering named types as a side effect - confirm
    # against the fastavro version in use.
    for definition in (dependency, schema):
        parse_schema(definition)

    roundtripped = roundtrip(schema, records)
    assert records == roundtripped
def test_named_type_cannot_be_redefined():
    """Defining the same named type twice in one schema must fail parsing.

    Three cases are checked in order: a nested record reusing the outer
    record's name, a record followed by an enum of the same name, and a
    record followed by a fixed of the same name.
    """
    nested_record_reuses_outer_name = {
        "type": "record",
        "namespace": "test.avro.training",
        "name": "SomeMessage",
        "fields": [
            {"name": "is_error", "type": "boolean", "default": False},
            {
                "name": "outcome",
                "type": [
                    {"type": "record", "name": "SomeMessage", "fields": []},
                    {
                        "type": "record",
                        "name": "ErrorRecord",
                        "fields": [
                            {
                                "name": "errors",
                                "type": {"type": "map", "values": "string"},
                                "doc": "doc",
                            },
                        ],
                    },
                ],
            },
        ],
    }

    record_then_enum = {
        "type": "record",
        "name": "SomeMessage",
        "fields": [
            {
                "name": "field1",
                "type": {"type": "record", "name": "ThisName", "fields": []},
            },
            {
                "name": "field2",
                "type": {
                    "type": "enum",
                    "name": "ThisName",
                    "symbols": ["FOO", "BAR"],
                },
            },
        ],
    }

    record_then_fixed = {
        "type": "record",
        "name": "SomeMessage",
        "fields": [
            {
                "name": "field1",
                "type": {"type": "record", "name": "ThatName", "fields": []},
            },
            {
                "name": "field2",
                "type": {"type": "fixed", "name": "ThatName", "size": 8},
            },
        ],
    }

    cases = [
        (
            nested_record_reuses_outer_name,
            "redefined named type: test.avro.training.SomeMessage",
        ),
        (record_then_enum, "redefined named type: ThisName"),
        (record_then_fixed, "redefined named type: ThatName"),
    ]

    for schema, expected_message in cases:
        with pytest.raises(SchemaParseException, match=expected_message):
            parse_schema(schema)
def test_unknown_type():
    """parse_schema raises UnknownType for a type name it does not know."""
    schema = {"type": "unknown"}

    with pytest.raises(UnknownType):
        parse_schema(schema)
def __init__(self, methodName='runTest'):
    """Initialise the test case with fastavro enabled and its schema parsed.

    ``self.SCHEMA`` is set to the fastavro-parsed form of the JSON in
    ``self.SCHEMA_STRING``.
    """
    super(TestFastAvro, self).__init__(methodName)
    self.use_fastavro = True

    raw_schema = json.loads(self.SCHEMA_STRING)
    self.SCHEMA = parse_schema(raw_schema)