def test_avro_schema_to_mce_fields_record_with_two_fields():
    schema = """
{
  "type": "record",
  "name": "some.event.name",
  "namespace": "not.relevant.namespace",
  "fields": [
    {"name": "a", "type": "string", "doc": "some.doc"},
    {"name": "b", "type": "string", "doc": "some.doc"}
  ]
}
"""
    fields = avro_schema_to_mce_fields(schema)
    expected_field_paths = [
        "[version=2.0].[type=name].[type=string].a",
        "[version=2.0].[type=name].[type=string].b",
    ]
    assert_field_paths_match(fields, expected_field_paths)
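# The tests in this module call a shared helper, assert_field_paths_match, that is
# not included in this excerpt. A minimal sketch of what such a helper could look
# like (the real utility in the test module may differ):
def assert_field_paths_match(
    fields: List[SchemaField], expected_field_paths: List[str]
) -> None:
    # Compare every generated fieldPath against the expected list, in order.
    assert [f.fieldPath for f in fields] == expected_field_paths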
def test_avro_schema_to_mce_fields_sample_events_with_different_field_types():
    examples = [SCHEMA_WITH_MAP_TYPE_FIELD]
    for schema in examples:
        fields = avro_schema_to_mce_fields(schema)
        assert len(fields) == 1
def test_simple_record_with_primitive_types():
    schema = """
{
  "type": "record",
  "name": "Simple",
  "namespace": "com.linkedin",
  "fields": [
    {"name": "stringField", "type": "string", "doc": "string field"},
    {"name": "booleanField", "type": "boolean"},
    {"name": "intField", "type": "int"},
    {
      "name": "enumField",
      "type": {
        "type": "enum",
        "name": "MyTestEnumField",
        "symbols": ["TEST", "TEST1"],
        "symbolDoc": {
          "TEST": "test enum",
          "TEST1": "test1 enum"
        }
      }
    }
  ]
}
"""
    fields = avro_schema_to_mce_fields(schema)
    expected_field_paths = [
        "[version=2.0].[type=Simple].[type=string].stringField",
        "[version=2.0].[type=Simple].[type=boolean].booleanField",
        "[version=2.0].[type=Simple].[type=int].intField",
        "[version=2.0].[type=Simple].[type=enum].enumField",
    ]
    assert_field_paths_match(fields, expected_field_paths)
def test_avro_schema_to_mce_fields_with_nesting_across_records():
    schema = """
[
  {
    "type": "record",
    "name": "Address",
    "fields": [
      {"name": "streetAddress", "type": "string"},
      {"name": "city", "type": "string"}
    ]
  },
  {
    "type": "record",
    "name": "Person",
    "fields": [
      {"name": "firstname", "type": "string"},
      {"name": "lastname", "type": "string"},
      {"name": "address", "type": "Address"}
    ]
  }
]
"""
    fields = avro_schema_to_mce_fields(schema)
    expected_field_paths = [
        "[version=2.0].[type=union]",
        "[version=2.0].[type=union].[type=Address].[type=string].streetAddress",
        "[version=2.0].[type=union].[type=Address].[type=string].city",
        "[version=2.0].[type=union].[type=Person].[type=string].firstname",
        "[version=2.0].[type=union].[type=Person].[type=string].lastname",
        "[version=2.0].[type=union].[type=Person].[type=Address].address",
    ]
    assert_field_paths_match(fields, expected_field_paths)
def get_schema_fields_for_column(
    self, dataset_name: str, column: dict, pk_constraints: Optional[dict] = None
) -> List[SchemaField]:
    fields = super().get_schema_fields_for_column(dataset_name, column, pk_constraints)
    if isinstance(column["type"], (datatype.ROW, sqltypes.ARRAY, datatype.MAP)):
        assert len(fields) == 1
        field = fields[0]
        # Get the avro schema for the subfields along with the parent complex field.
        avro_schema = self.get_avro_schema_from_data_type(column["type"], column["name"])
        newfields = schema_util.avro_schema_to_mce_fields(
            json.dumps(avro_schema), default_nullable=True
        )
        # The first field is the parent complex field; carry over its metadata.
        newfields[0].nullable = field.nullable
        newfields[0].description = field.description
        newfields[0].isPartOfKey = field.isPartOfKey
        return newfields
    return fields
def test_simple_nested_record_with_a_string_field_for_key_schema():
    schema = """
{
  "type": "record",
  "name": "SimpleNested",
  "namespace": "com.linkedin",
  "fields": [{
    "name": "nestedRcd",
    "type": {
      "type": "record",
      "name": "InnerRcd",
      "fields": [{
        "name": "aStringField",
        "type": "string"
      }]
    }
  }]
}
"""
    fields = avro_schema_to_mce_fields(schema, True)
    expected_field_paths: List[str] = [
        "[version=2.0].[key=True].[type=SimpleNested].[type=InnerRcd].nestedRcd",
        "[version=2.0].[key=True].[type=SimpleNested].[type=InnerRcd].nestedRcd.[type=string].aStringField",
    ]
    assert_field_paths_match(fields, expected_field_paths)
def test_nested_arrays():
    schema = """
{
  "type": "record",
  "name": "NestedArray",
  "namespace": "com.linkedin",
  "fields": [{
    "name": "ar",
    "type": {
      "type": "array",
      "items": {
        "type": "array",
        "items": [
          "null",
          {
            "type": "record",
            "name": "Foo",
            "fields": [{"name": "a", "type": "long"}]
          }
        ]
      }
    }
  }]
}
"""
    fields = avro_schema_to_mce_fields(schema)
    expected_field_paths: List[str] = [
        "[version=2.0].[type=NestedArray].[type=array].[type=array].[type=Foo].ar",
        "[version=2.0].[type=NestedArray].[type=array].[type=array].[type=Foo].ar.[type=long].a",
    ]
    assert_field_paths_match(fields, expected_field_paths)
def test_logical_types():
    schema: str = """
{
  "type": "record",
  "name": "test_logical_types",
  "fields": [
    {"name": "decimal_logical", "type": "bytes", "logicalType": "decimal", "precision": 4, "scale": 2},
    {"name": "uuid_logical", "type": "string", "logicalType": "uuid"},
    {"name": "date_logical", "type": "int", "logicalType": "date"},
    {"name": "time_millis_logical", "type": "int", "logicalType": "time-millis"},
    {"name": "time_micros_logical", "type": "long", "logicalType": "time-micros"},
    {"name": "timestamp_millis_logical", "type": "long", "logicalType": "timestamp-millis"},
    {"name": "timestamp_micros_logical", "type": "long", "logicalType": "timestamp-micros"}
  ]
}
"""
    fields: List[SchemaField] = avro_schema_to_mce_fields(schema, is_key_schema=False)
    expected_field_paths: List[str] = [
        "[version=2.0].[type=test_logical_types].[type=bytes].decimal_logical",
        "[version=2.0].[type=test_logical_types].[type=string].uuid_logical",
        "[version=2.0].[type=test_logical_types].[type=int].date_logical",
        "[version=2.0].[type=test_logical_types].[type=int].time_millis_logical",
        "[version=2.0].[type=test_logical_types].[type=long].time_micros_logical",
        "[version=2.0].[type=test_logical_types].[type=long].timestamp_millis_logical",
        "[version=2.0].[type=test_logical_types].[type=long].timestamp_micros_logical",
    ]
    assert_field_paths_match(fields, expected_field_paths)
def test_avro_schema_to_mce_fields_toplevel_isnt_a_record():
    examples = [SCHEMA_WITH_TOP_LEVEL_PRIMITIVE_FIELD]
    for schema in examples:
        fields = avro_schema_to_mce_fields(schema)
        assert len(fields) == 1
def test_map_of_union_of_int_and_record_of_union():
    schema = """
{
  "type": "record",
  "name": "MapSample",
  "namespace": "com.linkedin",
  "fields": [{
    "name": "aMap",
    "type": {
      "type": "map",
      "values": [
        "int",
        {
          "type": "record",
          "name": "Rcd",
          "fields": [{
            "name": "aUnion",
            "type": ["null", "string", "int"]
          }]
        }
      ]
    }
  }]
}
"""
    fields = avro_schema_to_mce_fields(schema)
    expected_field_paths = [
        "[version=2.0].[type=MapSample].[type=map].[type=union].aMap",
        "[version=2.0].[type=MapSample].[type=map].[type=union].[type=int].aMap",
        "[version=2.0].[type=MapSample].[type=map].[type=union].[type=Rcd].aMap",
        "[version=2.0].[type=MapSample].[type=map].[type=union].[type=Rcd].aMap.[type=union].aUnion",
        "[version=2.0].[type=MapSample].[type=map].[type=union].[type=Rcd].aMap.[type=union].[type=string].aUnion",
        "[version=2.0].[type=MapSample].[type=map].[type=union].[type=Rcd].aMap.[type=union].[type=int].aUnion",
    ]
    assert_field_paths_match(fields, expected_field_paths)
def test_avro_schema_to_mce_fields_record_with_two_fields():
    examples = [SCHEMA_WITH_TWO_FIELD_RECORD]
    for schema in examples:
        fields = avro_schema_to_mce_fields(schema)
        assert len(fields) == 2
def test_recursive_avro():
    schema = """
{
  "type": "record",
  "name": "Recursive",
  "namespace": "com.linkedin",
  "fields": [{
    "name": "r",
    "type": {
      "type": "record",
      "name": "R",
      "fields": [
        {"name": "anIntegerField", "type": "int"},
        {"name": "aRecursiveField", "type": "com.linkedin.R"}
      ]
    }
  }]
}
"""
    fields = avro_schema_to_mce_fields(schema)
    expected_field_paths = [
        "[version=2.0].[type=Recursive].[type=R].r",
        "[version=2.0].[type=Recursive].[type=R].r.[type=int].anIntegerField",
        "[version=2.0].[type=Recursive].[type=R].r.[type=R].aRecursiveField",
    ]
    assert_field_paths_match(fields, expected_field_paths)
def _get_schema_fields(self, topic: str, schema: Schema, is_key_schema: bool) -> List[SchemaField]: # Parse the schema and convert it to SchemaFields. fields: List[SchemaField] = [] if schema.schema_type == "AVRO": cleaned_str: str = self.get_schema_str_replace_confluent_ref_avro( schema) # "value.id" or "value.[type=string]id" fields = schema_util.avro_schema_to_mce_fields( cleaned_str, is_key_schema=is_key_schema) elif schema.schema_type == "PROTOBUF": imported_schemas: List[ ProtobufSchema] = self.get_schemas_from_confluent_ref_protobuf( schema) base_name: str = topic.replace(".", "_") fields = protobuf_util.protobuf_schema_to_mce_fields( ProtobufSchema( f"{base_name}-key.proto" if is_key_schema else f"{base_name}-value.proto", schema.schema_str, ), imported_schemas, is_key_schema=is_key_schema, ) else: self.report.report_warning( topic, f"Parsing kafka schema type {schema.schema_type} is currently not implemented", ) return fields
def get_schema_fields_for_column(
    self,
    dataset_name: str,
    column: Dict[Any, Any],
    pk_constraints: Optional[Dict[Any, Any]] = None,
) -> List[SchemaField]:
    fields = super().get_schema_fields_for_column(dataset_name, column, pk_constraints)
    if self._COMPLEX_TYPE.match(fields[0].nativeDataType) and isinstance(
        fields[0].type.type, NullTypeClass
    ):
        assert len(fields) == 1
        field = fields[0]
        # Get the avro schema for the subfields along with the parent complex field.
        avro_schema = self.get_avro_schema_from_native_data_type(
            field.nativeDataType, column["name"]
        )
        newfields = schema_util.avro_schema_to_mce_fields(
            json.dumps(avro_schema), default_nullable=True
        )
        # The first field is the parent complex field; carry over its metadata.
        newfields[0].nullable = field.nullable
        newfields[0].description = field.description
        newfields[0].isPartOfKey = field.isPartOfKey
        return newfields
    return fields
def test_avro_schema_to_mce_fields_with_default():
    schema = SCHEMA_WITH_DEFAULT_VALUE
    fields = avro_schema_to_mce_fields(schema)
    assert len(fields) == 1
    assert fields[0].description and "custom, default value" in fields[0].description
def test_avro_schema_to_mce_fields_toplevel_isnt_a_record():
    schema = """
{
  "type": "string"
}
"""
    fields = avro_schema_to_mce_fields(schema)
    expected_field_paths = ["[version=2.0].[type=string]"]
    assert_field_paths_match(fields, expected_field_paths)
def test_ignore_exceptions():
    malformed_schema: str = """
  "name": "event_ts",
  "type": "long",
  "logicalType": "timestamp-millis",
  "tags": [
    "business-timestamp"
  ]
"""
    fields: List[SchemaField] = avro_schema_to_mce_fields(malformed_schema)
    assert not fields
def test_avro_schema_to_mce_fields_events_with_nullable_fields():
    examples = [
        SCHEMA_WITH_OPTIONAL_FIELD_VIA_UNION_TYPE,
        SCHEMA_WITH_OPTIONAL_FIELD_VIA_UNION_TYPE_NULL_ISNT_FIRST_IN_UNION,
        SCHEMA_WITH_OPTIONAL_FIELD_VIA_PRIMITIVE_TYPE,
    ]
    for schema in examples:
        fields = avro_schema_to_mce_fields(schema)
        assert len(fields) == 1
        assert fields[0].nullable
def test_avro_schema_to_mce_fields_events_with_nullable_fields():
    events = [
        EXAMPLE_EVENT_OPTIONAL_FIELD_VIA_UNION_TYPE,
        EXAMPLE_EVENT_OPTIONAL_FIELD_VIA_UNION_TYPE_NULL_ISNT_FIRST_IN_UNION,
        EXAMPLE_EVENT_OPTIONAL_FIELD_VIA_PRIMITIVE_TYPE,
    ]
    for event in events:
        fields = avro_schema_to_mce_fields(event)
        assert len(fields) == 1
        assert fields[0].nullable
def test_avro_sample_payment_schema_to_mce_fields_with_nesting():
    schema = """
{
  "type": "record",
  "name": "Payment",
  "namespace": "some.event.namespace",
  "fields": [
    {"name": "id", "type": "string"},
    {"name": "amount", "type": "double"},
    {"name": "name", "type": "string", "default": ""},
    {
      "name": "phoneNumber",
      "type": [
        {
          "type": "record",
          "name": "PhoneNumber",
          "fields": [
            {"name": "areaCode", "type": "string", "default": ""},
            {"name": "countryCode", "type": "string", "default": ""},
            {"name": "prefix", "type": "string", "default": ""},
            {"name": "number", "type": "string", "default": ""}
          ]
        },
        "null"
      ],
      "default": "null"
    }
  ]
}
"""
    fields = avro_schema_to_mce_fields(schema)
    expected_field_paths = [
        "[version=2.0].[type=Payment].[type=string].id",
        "[version=2.0].[type=Payment].[type=double].amount",
        "[version=2.0].[type=Payment].[type=string].name",
        "[version=2.0].[type=Payment].[type=PhoneNumber].phoneNumber",
        "[version=2.0].[type=Payment].[type=PhoneNumber].phoneNumber.[type=string].areaCode",
        "[version=2.0].[type=Payment].[type=PhoneNumber].phoneNumber.[type=string].countryCode",
        "[version=2.0].[type=Payment].[type=PhoneNumber].phoneNumber.[type=string].prefix",
        "[version=2.0].[type=Payment].[type=PhoneNumber].phoneNumber.[type=string].number",
    ]
    assert_field_paths_match(fields, expected_field_paths)
def _extract_record(self, topic: str) -> MetadataChangeEvent:
    logger.debug(f"topic = {topic}")
    platform = "kafka"
    dataset_name = topic
    actor = "urn:li:corpuser:etl"
    sys_time = get_sys_time()
    dataset_snapshot = DatasetSnapshot(
        urn=f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.source_config.env})",
        aspects=[],  # we append to this list later on
    )
    dataset_snapshot.aspects.append(Status(removed=False))

    # Fetch the schema from the registry.
    has_schema = True
    try:
        registered_schema = self.schema_registry_client.get_latest_version(topic + "-value")
        schema = registered_schema.schema
    except Exception as e:
        self.report.report_warning(topic, f"failed to get schema: {e}")
        has_schema = False

    # Parse the schema.
    fields: List[SchemaField] = []
    if has_schema and schema.schema_type == "AVRO":
        fields = schema_util.avro_schema_to_mce_fields(schema.schema_str)
    elif has_schema:
        self.report.report_warning(
            topic, f"unable to parse kafka schema type {schema.schema_type}"
        )

    if has_schema:
        schema_metadata = SchemaMetadata(
            schemaName=topic,
            version=0,
            hash=str(schema._hash),
            platform=f"urn:li:dataPlatform:{platform}",
            platformSchema=KafkaSchema(documentSchema=schema.schema_str),
            fields=fields,
            created=AuditStamp(time=sys_time, actor=actor),
            lastModified=AuditStamp(time=sys_time, actor=actor),
        )
        dataset_snapshot.aspects.append(schema_metadata)

    metadata_record = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    return metadata_record
def _get_schema_fields(
    self, pulsar_topic: PulsarTopic, schema: PulsarSchema, is_key_schema: bool
) -> List[SchemaField]:
    # Parse the schema and convert it to SchemaFields.
    fields: List[SchemaField] = []
    if schema.schema_type == "AVRO" or schema.schema_type == "JSON":
        # Extract fields from the schema and get the FQN for the schema.
        fields = schema_util.avro_schema_to_mce_fields(
            schema.schema_str, is_key_schema=is_key_schema
        )
    else:
        self.report.report_warning(
            pulsar_topic.fullname,
            f"Parsing Pulsar schema type {schema.schema_type} is currently not implemented",
        )
    return fields
def test_key_schema_handling():
    """Tests key schema handling."""
    schema = """
{
  "type": "record",
  "name": "ABFooUnion",
  "namespace": "com.linkedin",
  "fields": [{
    "name": "a",
    "type": [
      {
        "type": "record",
        "name": "A",
        "fields": [{"name": "f", "type": "string"}]
      },
      {
        "type": "record",
        "name": "B",
        "fields": [{"name": "f", "type": "string"}]
      },
      {
        "type": "array",
        "items": {
          "type": "array",
          "items": [
            "null",
            {
              "type": "record",
              "name": "Foo",
              "fields": [{"name": "f", "type": "long"}]
            }
          ]
        }
      }
    ]
  }]
}
"""
    fields: List[SchemaField] = avro_schema_to_mce_fields(schema, is_key_schema=True)
    expected_field_paths: List[str] = [
        "[version=2.0].[key=True].[type=ABFooUnion].[type=union].a",
        "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=A].a",
        "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=A].a.[type=string].f",
        "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=B].a",
        "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=B].a.[type=string].f",
        "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array].[type=array].[type=Foo].a",
        "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array].[type=array].[type=Foo].a.[type=long].f",
    ]
    assert_field_paths_match(fields, expected_field_paths)
    for f in fields:
        assert f.isPartOfKey
def _get_schema_fields(self, topic: str, schema: Schema, is_key_schema: bool) -> List[SchemaField]: # Parse the schema and convert it to SchemaFields. fields: List[SchemaField] = [] if schema.schema_type == "AVRO": cleaned_str: str = self.get_schema_str_replace_confluent_ref_avro( schema) # "value.id" or "value.[type=string]id" fields = schema_util.avro_schema_to_mce_fields( cleaned_str, is_key_schema=is_key_schema) else: self.report.report_warning( topic, f"Parsing kafka schema type {schema.schema_type} is currently not implemented", ) return fields
def test_mce_avro_parses_okay():
    """This test helps to exercise the complexity in parsing and catch unexpected regressions."""
    schema = Path(
        os.path.join(
            os.path.dirname(__file__),
            "..",
            "..",
            "src",
            "datahub",
            "metadata",
            "schema.avsc",
        )
    ).read_text()
    fields = avro_schema_to_mce_fields(schema)
    assert len(fields)
    # Ensure that all the paths corresponding to the AVRO fields are unique.
    assert_field_paths_are_unique(fields)
    log_field_paths(fields)
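# test_mce_avro_parses_okay above also relies on two helpers that are not part of
# this excerpt. Minimal sketches, assuming they only need each SchemaField's
# fieldPath (the real test utilities may differ):
import logging

_sketch_logger = logging.getLogger(__name__)

def assert_field_paths_are_unique(fields: List[SchemaField]) -> None:
    # Every generated fieldPath must be distinct across the schema.
    field_paths = [f.fieldPath for f in fields]
    assert len(field_paths) == len(set(field_paths))

def log_field_paths(fields: List[SchemaField]) -> None:
    # Emit the generated paths to ease debugging of failing assertions.
    _sketch_logger.debug("\n".join(f.fieldPath for f in fields))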
def test_needs_disambiguation_nested_union_of_records_with_same_field_name():
    schema = """
{
  "type": "record",
  "name": "ABFooUnion",
  "namespace": "com.linkedin",
  "fields": [{
    "name": "a",
    "type": [
      {
        "type": "record",
        "name": "A",
        "fields": [{"name": "f", "type": "string"}]
      },
      {
        "type": "record",
        "name": "B",
        "fields": [{"name": "f", "type": "string"}]
      },
      {
        "type": "array",
        "items": {
          "type": "array",
          "items": [
            "null",
            {
              "type": "record",
              "name": "Foo",
              "fields": [{"name": "f", "type": "long"}]
            }
          ]
        }
      }
    ]
  }]
}
"""
    fields = avro_schema_to_mce_fields(schema)
    expected_field_paths: List[str] = [
        "[version=2.0].[type=ABFooUnion].[type=union].a",
        "[version=2.0].[type=ABFooUnion].[type=union].[type=A].a",
        "[version=2.0].[type=ABFooUnion].[type=union].[type=A].a.[type=string].f",
        "[version=2.0].[type=ABFooUnion].[type=union].[type=B].a",
        "[version=2.0].[type=ABFooUnion].[type=union].[type=B].a.[type=string].f",
        "[version=2.0].[type=ABFooUnion].[type=union].[type=array].[type=array].[type=Foo].a",
        "[version=2.0].[type=ABFooUnion].[type=union].[type=array].[type=array].[type=Foo].a.[type=long].f",
    ]
    assert_field_paths_match(fields, expected_field_paths)
def test_logical_types():
    schema: str = """
{
  "type": "record",
  "name": "test_logical_types",
  "fields": [
    {"name": "decimal_logical", "type": "bytes", "logicalType": "decimal", "precision": 4, "scale": 2},
    {"name": "uuid_logical", "type": "string", "logicalType": "uuid"},
    {"name": "date_logical", "type": "int", "logicalType": "date"},
    {"name": "time_millis_logical", "type": "int", "logicalType": "time-millis"},
    {"name": "time_micros_logical", "type": "long", "logicalType": "time-micros"},
    {"name": "timestamp_millis_logical", "type": "long", "logicalType": "timestamp-millis"},
    {"name": "timestamp_micros_logical", "type": "long", "logicalType": "timestamp-micros"}
  ]
}
"""
    fields: List[SchemaField] = avro_schema_to_mce_fields(schema, is_key_schema=False)

    # Validate the field paths.
    expected_field_paths: List[str] = [
        "[version=2.0].[type=test_logical_types].[type=bytes].decimal_logical",
        "[version=2.0].[type=test_logical_types].[type=string].uuid_logical",
        "[version=2.0].[type=test_logical_types].[type=int].date_logical",
        "[version=2.0].[type=test_logical_types].[type=int].time_millis_logical",
        "[version=2.0].[type=test_logical_types].[type=long].time_micros_logical",
        "[version=2.0].[type=test_logical_types].[type=long].timestamp_millis_logical",
        "[version=2.0].[type=test_logical_types].[type=long].timestamp_micros_logical",
    ]
    assert_field_paths_match(fields, expected_field_paths)

    # Validate the field types.
    expected_types: List[Type] = [
        NumberTypeClass,
        StringTypeClass,
        DateTypeClass,
        TimeTypeClass,
        TimeTypeClass,
        TimeTypeClass,
        TimeTypeClass,
    ]
    assert expected_types == [type(field.type.type) for field in fields]
def get_schema_fields_for_hive_column(
    hive_column_name: str,
    hive_column_type: str,
    description: Optional[str] = None,
    default_nullable: bool = False,
    is_part_of_key: bool = False,
) -> List[SchemaField]:
    avro_schema_json = get_avro_schema_for_hive_column(
        hive_column_name=hive_column_name, hive_column_type=hive_column_type
    )
    schema_fields = avro_schema_to_mce_fields(
        avro_schema_string=json.dumps(avro_schema_json),
        default_nullable=default_nullable,
    )
    assert schema_fields
    if HiveColumnToAvroConverter.is_primitive_hive_type(hive_column_type):
        # A primitive avro schema does not have any field names. Append the column name to the fieldPath.
        schema_fields[0].fieldPath += f".{hive_column_name}"
    if description:
        schema_fields[0].description = description
    schema_fields[0].isPartOfKey = is_part_of_key
    return schema_fields
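# Illustrative (hypothetical) use of get_schema_fields_for_hive_column above. The
# column name "config" and the Hive type string are invented for this sketch, and
# the exact fieldPaths depend on get_avro_schema_for_hive_column, so none are asserted.
def example_hive_struct_column_to_schema_fields() -> None:
    for schema_field in get_schema_fields_for_hive_column(
        hive_column_name="config",
        hive_column_type="struct<a:int,b:string>",
        description="example nested Hive column",
        default_nullable=True,
    ):
        print(schema_field.fieldPath, schema_field.nativeDataType)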
def test_avro_schema_namespacing():
    schema = """
{
  "type": "record",
  "name": "name",
  "namespace": "should.not.show.up.namespace",
  "fields": [
    {
      "name": "aStringField",
      "type": "string",
      "doc": "some docs",
      "default": "this is custom, default value"
    }
  ]
}
"""
    fields = avro_schema_to_mce_fields(schema)
    expected_field_paths = [
        "[version=2.0].[type=name].[type=string].aStringField",
    ]
    assert_field_paths_match(fields, expected_field_paths)
def test_union_with_nested_record_of_union():
    schema = """
{
  "type": "record",
  "name": "UnionSample",
  "namespace": "com.linkedin",
  "fields": [
    {
      "name": "aUnion",
      "type": [
        "boolean",
        {
          "type": "record",
          "name": "Rcd",
          "fields": [
            {"name": "aNullableStringField", "type": ["null", "string"]}
          ]
        }
      ]
    }
  ]
}
"""
    fields = avro_schema_to_mce_fields(schema)
    expected_field_paths = [
        "[version=2.0].[type=UnionSample].[type=union].aUnion",
        "[version=2.0].[type=UnionSample].[type=union].[type=boolean].aUnion",
        "[version=2.0].[type=UnionSample].[type=union].[type=Rcd].aUnion",
        "[version=2.0].[type=UnionSample].[type=union].[type=Rcd].aUnion.[type=string].aNullableStringField",
    ]
    assert_field_paths_match(fields, expected_field_paths)
    assert isinstance(fields[3].type.type, StringTypeClass)
    assert fields[0].nativeDataType == "union"
    assert fields[1].nativeDataType == "boolean"
    assert fields[2].nativeDataType == "Rcd"
    assert fields[3].nativeDataType == "string"
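# A companion sketch (not part of the original test suite): based on the
# aNullableStringField expectation above and the nullable-field tests earlier, a
# ["null", "string"] union should surface as a single nullable string field. The
# record name "R" and field name "f" are made up for illustration.
def test_nullable_union_collapses_to_inner_type_sketch():
    schema = """
{
  "type": "record",
  "name": "R",
  "namespace": "com.linkedin",
  "fields": [{"name": "f", "type": ["null", "string"]}]
}
"""
    fields = avro_schema_to_mce_fields(schema)
    assert len(fields) == 1
    assert fields[0].nullable
    assert isinstance(fields[0].type.type, StringTypeClass)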