def test_schema_equivilence(load_avsc):
    """Check that two Schema objects built from the same file compare equal.

    Both the loaded schema definitions and the ``Schema`` objects wrapping
    them are compared, through ``==`` and through a direct ``__eq__`` call.

    Args:
        load_avsc: Callable that takes an ``.avsc`` file name and returns the
            schema definition handed to ``Schema`` (presumably a pytest
            fixture -- confirm in conftest).
    """
    schema_str1 = load_avsc('basic_schema.avsc')
    schema_str2 = load_avsc('basic_schema.avsc')

    schema = Schema(schema_str1, 'AVRO')
    schema2 = Schema(schema_str2, 'AVRO')

    # A direct __eq__ call may return NotImplemented, which is truthy, so the
    # result is compared against True to keep the check from passing vacuously.
    assert schema.__eq__(schema2) is True
    assert schema == schema2
    assert schema_str1.__eq__(schema_str2) is True
    assert schema_str1 == schema_str2
 def new_get_latest_version(subject_name: str) -> RegisteredSchema:
     """Return a fixed AVRO RegisteredSchema; ``subject_name`` is not used.

     NOTE(review): ``schema_str_ref`` is not defined inside this fragment, so
     it has to come from an enclosing or module scope -- confirm.
     """
     referenced_schema = Schema(schema_str=schema_str_ref, schema_type="AVRO")
     return RegisteredSchema(
         schema_id="schema_id_1",
         schema=referenced_schema,
         subject="test",
         version=1,
     )
def test_register_schema(mock_schema_registry, load_avsc):
    """Register the basic schema under 'test-key' and check the returned id.

    The value returned by ``register_schema`` must equal
    ``find_schema_id('test-key')``.
    """
    client = mock_schema_registry({'url': TEST_URL})
    avro_schema = Schema(load_avsc('basic_schema.avsc'), schema_type='AVRO')

    registered_id = client.register_schema('test-key', avro_schema)
    expected_id = find_schema_id('test-key')
    assert registered_id == expected_id
def test_get_schema(mock_schema_registry, load_avsc):
    """Fetch schema id 47 and compare it with the mock registry's schema.

    The expected schema is loaded from ``mock_schema_registry.SCHEMA`` and
    compared with the fetched one through ``cmp_schema``.
    """
    client = mock_schema_registry({'url': TEST_URL})

    expected = Schema(
        load_avsc(mock_schema_registry.SCHEMA),
        schema_type='AVRO',
    )
    fetched = client.get_schema(47)

    assert cmp_schema(expected, fetched)
def test_register_schema_incompatible(mock_schema_registry, load_avsc):
    """Registering under subject 'conflict' must raise an incompatibility error.

    The raised ``SchemaRegistryError`` must match "Incompatible Schema" and
    carry HTTP status 409 with error code -1.
    """
    client = mock_schema_registry({'url': TEST_URL})
    avro_schema = Schema(load_avsc('basic_schema.avsc'), schema_type='AVRO')

    with pytest.raises(
        SchemaRegistryError, match="Incompatible Schema"
    ) as exc_info:
        client.register_schema('conflict', avro_schema)

    error = exc_info.value
    assert error.http_status_code == 409
    assert error.error_code == -1
def test_register_schema_invalid(mock_schema_registry, load_avsc):
    """Registering 'invalid_schema.avsc' under 'invalid' must be rejected.

    The raised ``SchemaRegistryError`` must match "Invalid Schema" and carry
    HTTP status 422 with error code 42201.
    """
    client = mock_schema_registry({'url': TEST_URL})
    bad_schema = Schema(load_avsc('invalid_schema.avsc'), schema_type='AVRO')

    with pytest.raises(
        SchemaRegistryError, match="Invalid Schema"
    ) as exc_info:
        client.register_schema('invalid', bad_schema)

    error = exc_info.value
    assert error.http_status_code == 422
    assert error.error_code == 42201
def test_get_registration_schema_not_found(mock_schema_registry, load_avsc):
    """Looking up a schema under 'schemanotfound' must raise "Schema not found".

    The raised ``SchemaRegistryError`` must carry HTTP status 404 and error
    code 40403.
    """
    client = mock_schema_registry({'url': TEST_URL})

    missing_subject = 'schemanotfound'
    avro_schema = Schema(
        load_avsc(mock_schema_registry.SCHEMA),
        schema_type='AVRO',
    )

    with pytest.raises(
        SchemaRegistryError, match="Schema not found"
    ) as exc_info:
        client.lookup_schema(missing_subject, avro_schema)

    error = exc_info.value
    assert error.http_status_code == 404
    assert error.error_code == 40403
def test_get_version(mock_schema_registry, load_avsc):
    """Fetch version 3 of subject 'get_version' and check every field.

    Subject, version, schema (via ``cmp_schema``) and schema id (via
    ``find_schema_id``) of the returned registration are all verified.
    """
    client = mock_schema_registry({'url': TEST_URL})

    subject, version = "get_version", 3
    expected_schema = Schema(
        load_avsc(mock_schema_registry.SCHEMA),
        schema_type='AVRO',
    )

    registered = client.get_version(subject, version)

    assert registered.subject == subject
    assert registered.version == version
    assert cmp_schema(registered.schema, expected_schema)
    assert registered.schema_id == find_schema_id(subject)
def test_get_registration(mock_schema_registry, load_avsc):
    """Look up the mock schema under 'get_registration' and check the result.

    Subject, version (``mock_schema_registry.VERSION``), schema id (via
    ``find_schema_id``) and schema (via ``cmp_schema``) are all verified.
    """
    client = mock_schema_registry({'url': TEST_URL})

    subject = 'get_registration'
    expected_schema = Schema(
        load_avsc(mock_schema_registry.SCHEMA),
        schema_type='AVRO',
    )

    registration = client.lookup_schema(subject, expected_schema)

    assert registration.subject == subject
    assert registration.version == mock_schema_registry.VERSION
    assert registration.schema_id == find_schema_id(subject)
    assert cmp_schema(registration.schema, expected_schema)
def test_register_schema_cache(mock_schema_registry, load_avsc):
    """Check that repeated registrations of one schema issue a single POST.

    The schema is registered once, then 1000 more times from a thread pool.
    The mock registry's POST counter for '/subjects/test-cache/versions'
    must grow by exactly one over the whole run.

    Args:
        mock_schema_registry: Factory called with a config dict; the object it
            returns exposes ``register_schema`` and a ``counter`` mapping.
        load_avsc: Callable returning the schema definition for a file name.
    """
    conf = {'url': TEST_URL}
    sr = mock_schema_registry(conf)
    # One Schema object is shared by every call, so the worker threads
    # register exactly what the first call registered (not the raw string).
    schema = Schema(load_avsc('basic_schema.avsc'), 'AVRO')
    endpoint = '/subjects/test-cache/versions'

    count_before = sr.counter['POST'].get(endpoint, 0)

    # Caching only starts after the first response is handled.
    # A possible improvement would be to add request caching to the http client
    # to catch in-flight requests as well.
    sr.register_schema('test-cache', schema)

    # Leaving the ``with`` block waits for all submitted calls to finish.
    with ThreadPoolExecutor(max_workers=10) as executor:
        fs = [
            executor.submit(sr.register_schema, 'test-cache', schema)
            for _ in range(1000)
        ]

    # Future.result() re-raises whatever the worker raised; without it a
    # failing register_schema call would go unnoticed and the counter check
    # below could pass for the wrong reason.
    for future in fs:
        future.result()

    count_after = sr.counter['POST'].get(endpoint, 0)

    assert count_after - count_before == 1
    def test_get_schema_str_replace_confluent_ref_avro(self):
        """Check that an external Avro reference is replaced in the schema.

        ``get_latest_version`` on the registry client is patched to return a
        fixed schema. The string produced by
        ``get_schema_str_replace_confluent_ref_avro`` is then compared with
        the compacted expected schema for two reference layouts.
        """
        # Schema whose 'my_field1' type names the external record 'TestTopic1'.
        schema_str_orig = """
        {
          "fields": [
            {
              "name": "my_field1",
              "type": "TestTopic1"
            }
          ],
          "name": "TestTopic1Val",
          "namespace": "io.acryl",
          "type": "record"
        }
        """
        # Definition of the referenced record.
        schema_str_ref = """
        {
          "doc": "Sample schema to help you get started.",
          "fields": [
            {
              "doc": "The int type is a 32-bit signed integer.",
              "name": "my_field1",
              "type": "int"
            }
          ],
          "name": "TestTopic1",
          "namespace": "io.acryl",
          "type": "record"
        }
    """

        # Expected output: the referenced definition spliced in as the type.
        schema_str_final = (
            """
    {
      "fields": [
        {
          "name": "my_field1",
          "type": """
            + schema_str_ref
            + """
            }
          ],
          "name": "TestTopic1Val",
          "namespace": "io.acryl",
          "type": "record"
        }
        """
        )

        source_config = KafkaSourceConfig.parse_obj(
            {
                "connection": {
                    "bootstrap": "localhost:9092",
                    "schema_registry_url": "http://localhost:8081",
                },
            }
        )
        registry = ConfluentSchemaRegistry.create(
            source_config, KafkaSourceReport()
        )

        def new_get_latest_version(subject_name: str) -> RegisteredSchema:
            # Always hand back the referenced schema, whatever the subject.
            ref_schema = Schema(schema_str=schema_str_ref, schema_type="AVRO")
            return RegisteredSchema(
                schema_id="schema_id_1",
                schema=ref_schema,
                subject="test",
                version=1,
            )

        reference_cases = [
            # The external reference would match by name.
            dict(name="TestTopic1", subject="schema_subject_1", version=1),
            # The external reference would match by subject.
            dict(name="schema_subject_1", subject="TestTopic1", version=1),
        ]

        for reference in reference_cases:
            with patch.object(
                registry.schema_registry_client,
                "get_latest_version",
                new_get_latest_version,
            ):
                schema_with_ref = Schema(
                    schema_str=schema_str_orig,
                    schema_type="AVRO",
                    references=[reference],
                )
                resolved = registry.get_schema_str_replace_confluent_ref_avro(
                    schema=schema_with_ref
                )
                expected = ConfluentSchemaRegistry._compact_schema(
                    schema_str_final
                )
                assert resolved == expected
# Example #12
# 0
    def test_kafka_source_workunits_schema_registry_subject_name_strategies(
            self, mock_kafka_consumer, mock_schema_registry_client):
        """Check Kafka source workunits for three subject name strategies.

        Three topics with key/value schemas plus one topic without a schema
        are exposed through the mocked consumer and schema registry client.
        The test expects 8 workunits. For the first three MetadataChangeEvent
        workunits, the aspect at index 1 must be schema metadata whose key
        and value schemas match the registered ones. Later events must carry
        no schema metadata aspect.
        """
        # Topic name -> (key schema, value schema), one entry per schema
        # registry subject name strategy.
        topic_subject_schema_map: Dict[str, Tuple[RegisteredSchema, RegisteredSchema]] = {
            # TopicNameStrategy is used for subject
            "topic1": (
                RegisteredSchema(
                    schema_id="schema_id_2",
                    schema=Schema(
                        schema_str='{"type":"record", "name":"Topic1Key", "namespace": "test.acryl", "fields": [{"name":"t1key", "type": "string"}]}',
                        schema_type="AVRO",
                    ),
                    subject="topic1-key",
                    version=1,
                ),
                RegisteredSchema(
                    schema_id="schema_id_1",
                    schema=Schema(
                        schema_str='{"type":"record", "name":"Topic1Value", "namespace": "test.acryl", "fields": [{"name":"t1value", "type": "string"}]}',
                        schema_type="AVRO",
                    ),
                    subject="topic1-value",
                    version=1,
                ),
            ),
            # RecordNameStrategy is used for subject
            "topic2": (
                RegisteredSchema(
                    schema_id="schema_id_3",
                    schema=Schema(
                        schema_str='{"type":"record", "name":"Topic2Key", "namespace": "test.acryl", "fields": [{"name":"t2key", "type": "string"}]}',
                        schema_type="AVRO",
                    ),
                    subject="test.acryl.Topic2Key",
                    version=1,
                ),
                RegisteredSchema(
                    schema_id="schema_id_4",
                    schema=Schema(
                        schema_str='{"type":"record", "name":"Topic2Value", "namespace": "test.acryl", "fields": [{"name":"t2value", "type": "string"}]}',
                        schema_type="AVRO",
                    ),
                    subject="test.acryl.Topic2Value",
                    version=1,
                ),
            ),
            # TopicRecordNameStrategy is used for subject
            "topic3": (
                RegisteredSchema(
                    schema_id="schema_id_4",
                    schema=Schema(
                        schema_str='{"type":"record", "name":"Topic3Key", "namespace": "test.acryl", "fields": [{"name":"t3key", "type": "string"}]}',
                        schema_type="AVRO",
                    ),
                    subject="topic3-test.acryl.Topic3Key-key",
                    version=1,
                ),
                RegisteredSchema(
                    schema_id="schema_id_5",
                    schema=Schema(
                        schema_str='{"type":"record", "name":"Topic3Value", "namespace": "test.acryl", "fields": [{"name":"t3value", "type": "string"}]}',
                        schema_type="AVRO",
                    ),
                    subject="topic3-test.acryl.Topic3Value-value",
                    version=1,
                ),
            ),
        }

        # Mock the kafka consumer: every mapped topic plus one without schema.
        mock_kafka_instance = mock_kafka_consumer.return_value
        mock_cluster_metadata = MagicMock()
        mock_cluster_metadata.topics = [
            *topic_subject_schema_map,
            "schema_less_topic",
        ]
        mock_kafka_instance.list_topics.return_value = mock_cluster_metadata

        # Mock the schema registry client
        # - mock get_subjects: all subjects in topic_subject_schema_map
        registry_mock = mock_schema_registry_client.return_value
        registry_mock.get_subjects.return_value = [
            registered.subject
            for registered in chain.from_iterable(
                topic_subject_schema_map.values())
        ]

        # - mock get_latest_version: first schema with a matching subject
        def mock_get_latest_version(
                subject_name: str) -> Optional[RegisteredSchema]:
            candidates = chain.from_iterable(topic_subject_schema_map.values())
            return next(
                (rs for rs in candidates if rs.subject == subject_name),
                None,
            )

        registry_mock.get_latest_version = mock_get_latest_version

        # Test the kafka source
        source_config = {
            "connection": {
                "bootstrap": "localhost:9092"
            },
            # Setup the topic_subject_map for topic2 which uses RecordNameStrategy
            "topic_subject_map": {
                "topic2-key": "test.acryl.Topic2Key",
                "topic2-value": "test.acryl.Topic2Value",
            },
        }
        kafka_source = KafkaSource.create(
            source_config, PipelineContext(run_id="test"))
        workunits = list(kafka_source.get_workunits())

        mock_kafka_consumer.assert_called_once()
        mock_kafka_instance.list_topics.assert_called_once()
        assert len(workunits) == 8

        topics_with_schema = len(topic_subject_schema_map)
        mce_index = 0  # position among MetadataChangeEvent workunits only
        for work_unit in workunits:
            assert isinstance(work_unit, MetadataWorkUnit)
            event = work_unit.metadata
            if not isinstance(event, MetadataChangeEvent):
                continue
            expects_schema = mce_index < topics_with_schema
            mce_index += 1

            if not expects_schema:
                # Last topic('schema_less_topic') has no schema defined in the registry.
                # The schemaMetadata aspect should not be present for this.
                for aspect in event.proposedSnapshot.aspects:
                    assert not isinstance(aspect, SchemaMetadataClass)
                continue

            # First 3 workunits (topics) must have schemaMetadata aspect
            schema_aspect = event.proposedSnapshot.aspects[1]
            assert isinstance(schema_aspect, SchemaMetadataClass)
            assert isinstance(schema_aspect.platformSchema, KafkaSchemaClass)
            # Make sure the schema name is present in topic_subject_schema_map.
            assert schema_aspect.schemaName in topic_subject_schema_map
            key_registered, value_registered = topic_subject_schema_map[
                schema_aspect.schemaName]
            # Make sure the schema_str matches for the key schema.
            assert (schema_aspect.platformSchema.keySchema ==
                    key_registered.schema.schema_str)
            # Make sure the schema_str matches for the value schema.
            assert (schema_aspect.platformSchema.documentSchema ==
                    value_registered.schema.schema_str)
            # Make sure we have 2 fields, one from the key schema & one from the value schema.
            assert len(schema_aspect.fields) == 2
    def test_get_schema_str_replace_confluent_ref_avro(self):

        # References external schema 'TestTopic1' in the definition of 'my_field1' field.
        schema_str_orig = """
{
  "fields": [
    {
      "name": "my_field1",
      "type": "TestTopic1"
    }
  ],
  "name": "TestTopic1Val",
  "namespace": "io.acryl",
  "type": "record"
}
"""
        schema_str_ref = """
{
  "doc": "Sample schema to help you get started.",
  "fields": [
    {
      "doc": "The int type is a 32-bit signed integer.",
      "name": "my_field1",
      "type": "int"
    }
  ],
  "name": "TestTopic1",
  "namespace": "io.acryl",
  "type": "record"
}
"""

        schema_str_final = ("""
{
  "fields": [
    {
      "name": "my_field1",
      "type": """ + schema_str_ref + """
    }
  ],
  "name": "TestTopic1Val",
  "namespace": "io.acryl",
  "type": "record"
}
""")

        ctx = PipelineContext(run_id="test")
        kafka_source = KafkaSource.create(
            {
                "connection": {
                    "bootstrap": "localhost:9092"
                },
            },
            ctx,
        )

        def new_get_latest_version(subject_name: str) -> RegisteredSchema:
            return RegisteredSchema(
                schema_id="schema_id_1",
                schema=Schema(schema_str=schema_str_ref, schema_type="AVRO"),
                subject="test",
                version=1,
            )

        with patch.object(
                kafka_source.schema_registry_client,
                "get_latest_version",
                new_get_latest_version,
        ):
            schema_str = kafka_source.get_schema_str_replace_confluent_ref_avro(
                # The external reference would match by name.
                schema=Schema(
                    schema_str=schema_str_orig,
                    schema_type="AVRO",
                    references=[
                        dict(name="TestTopic1",
                             subject="schema_subject_1",
                             version=1)
                    ],
                ))
            assert schema_str == KafkaSource._compact_schema(schema_str_final)

        with patch.object(
                kafka_source.schema_registry_client,
                "get_latest_version",
                new_get_latest_version,
        ):
            schema_str = kafka_source.get_schema_str_replace_confluent_ref_avro(
                # The external reference would match by subject.
                schema=Schema(
                    schema_str=schema_str_orig,
                    schema_type="AVRO",
                    references=[
                        dict(name="schema_subject_1",
                             subject="TestTopic1",
                             version=1)
                    ],
                ))
            assert schema_str == KafkaSource._compact_schema(schema_str_final)