Beispiel #1
0
    def test_inspect_inferred(self) -> None:
        """Inspect the loaded samples against the loaded schema.

        The schema definition is created with a single argument. Expected
        metrics: integrity 1.0, specification 0.0, quality index 0.5.
        """
        # arrange
        records = DataLoader.load_samples()

        # act
        raw_schema = DataLoader.load_schema()
        definition = SchemaDefinition.create(raw_schema)
        metrics = self.inspector.inspect(records, definition)

        # assert
        self.assertEqual(1.0, metrics.attribute_integrity)
        self.assertEqual(0.0, metrics.attribute_specification)
        self.assertEqual(0.5, metrics.attribute_quality_index)
Beispiel #2
0
    def test_inspect_with_inferred_schemas(self):
        """Inspect the loaded samples against 'schema_registry_json.json'.

        The definition is created with True as its second argument. Expected
        metrics: integrity 1.0, specification 0.0, quality index 0.5.
        """
        # arrange
        definition = SchemaDefinition.create(
            DataLoader.load_schema_with_name("schema_registry_json.json"),
            True)
        records = DataLoader.load_samples()

        # act
        metrics = self.inspector.inspect(records, definition)

        # assert
        self.assertEqual(1.0, metrics.attribute_integrity)
        self.assertEqual(0.0, metrics.attribute_specification)
        self.assertEqual(0.5, metrics.attribute_quality_index)
    def test_convert_with_nested_expectations(self):
        """Convert nested expectations and compare with the stored result.

        The converted schema content is compared against
        'schema_nested_expectation_result_json.json'.
        """
        # arrange
        source_schema = DataLoader.load_schema_with_name(
            "schema_nested_expectation_json.json")
        expected_schema = DataLoader.load_schema_with_name(
            "schema_nested_expectation_result_json.json")

        # act
        definition = SchemaDefinition.create(source_schema, False)
        converted = self.parser.convert_expectations(definition)

        # assert
        self.assertStingEqualAsDict(converted.schema_content, expected_schema)
Beispiel #4
0
    def test_inspect_with_both_schema_formats(self):
        """Inspecting with the JSON and the Avro diff schema gives equal results."""
        # arrange
        schema_json, schema_avro = [
            DataLoader.load_schema_with_name(file_name)
            for file_name in ("schema_diff_json.json", "schema_diff_avro.json")
        ]
        records = DataLoader.load_samples()

        # act (JSON first, then Avro)
        result_json, result_avro = [
            self.inspector.inspect(records,
                                   SchemaDefinition.create(schema, False))
            for schema in (schema_json, schema_avro)
        ]

        # assert
        self.assertEqual(result_json, result_avro)
Beispiel #5
0
    def test_integrity_on_attribute_level_with_not_specified_partial_field(
            self) -> None:
        """Check attribute-level integrity of an unspecified, partly missing field.

        'random_string' is not part of the schema and is present in only two
        of the three samples; its attribute integrity is expected to be 1.
        """
        # arrange
        samples = [
            {
                "random_int": 1002,
                "random_string": 1
            },
            {
                "random_int": 1003,
                "random_string": 2
            },
            {
                "random_int": 1004
            },
        ]

        schema_definition = DataLoader.expand_schema(
            [("random_int", "integer")], [])

        # act
        result = self.inspector.inspect_attributes(samples, schema_definition)

        # assert
        attribute_details = result.attribute_details
        # assertIn reports the searched container on failure, unlike assertTrue
        self.assertIn('random_string', attribute_details,
                      "Missing integrity for attribute random_string")
        self.assertAlmostEqual(
            1, attribute_details['random_string'].attribute_integrity, 3,
            "Integrity of random_string is not correct")
Beispiel #6
0
    def test_specification_on_attribute_level_with_partial_expectations(
            self) -> None:
        """Check attribute-level specification when only one field has expectations.

        'random_int' has a type and a 'minimum' expectation, 'random_string'
        only a type; the expected attribute specification is .75 and .5
        respectively.
        """
        # arrange
        samples = [
            {
                "random_int": 1002,
                "random_string": 1
            },
            {
                "random_int": 1003,
                "random_string": 2
            },
        ]

        schema_definition = DataLoader.expand_schema(
            [("random_int", "integer"),
             ("random_string", "string")], [], {"random_int": {
                 "minimum": 0
             }})

        # act
        result = self.inspector.inspect_attributes(samples, schema_definition)

        # assert
        attribute_details = result.attribute_details
        # assertIn reports the searched container on failure, unlike assertTrue
        self.assertIn('random_int', attribute_details)
        self.assertEqual(
            .75, attribute_details['random_int'].attribute_specification)
        self.assertEqual(
            .5, attribute_details['random_string'].attribute_specification)
Beispiel #7
0
    def test_specification_on_attribute_level_with_missing_specification(
            self) -> None:
        """Check attribute-level specification of a field absent from the schema.

        'random_string' occurs in the samples but not in the schema; its
        attribute specification is expected to be 0.0.
        """
        # arrange
        samples = [
            {
                "random_int": 1002,
                "random_string": 1
            },
            {
                "random_int": 1003,
                "random_string": 2
            },
        ]

        schema_definition = DataLoader.expand_schema(
            [("random_int", "integer")], [])

        # act
        result = self.inspector.inspect_attributes(samples, schema_definition)

        # assert
        attribute_details = result.attribute_details
        # assertIn reports the searched container on failure, unlike assertTrue
        self.assertIn('random_string', attribute_details)
        self.assertEqual(
            0.0, attribute_details['random_string'].attribute_specification)
Beispiel #8
0
    def test_inspect_with_non_unique_types_does_not_throw_exception(
            self) -> None:
        """Inspect an integer attribute whose samples mix int and str values.

        The inspection must complete, and the attribute integrity of
        'random_int' is expected to be 1/3 (to three decimal places).
        """
        # arrange
        samples = [
            {
                "random_int": 1002
            },
            {
                "random_int": "1003"
            },
            {
                "random_int": "1004"
            },
        ]

        schema_definition = DataLoader.expand_schema(
            [("random_int", "integer")], [],
            {"random_int": {
                "minimum": 0,
                "maximum": 100
            }})

        # act
        result = self.inspector.inspect(samples, schema_definition)

        # assert
        attribute_details = result.attribute_details
        # assertAlmostEquals is a deprecated alias that was removed in
        # Python 3.12; assertAlmostEqual is the supported spelling.
        self.assertAlmostEqual(
            (1 / 3), attribute_details['random_int'].attribute_integrity, 3)
Beispiel #9
0
    def test_inspect_with_missing_field(self):
        """Inspect one sample that lacks the required 'random_int' field.

        The sample only carries 'random_other', which is not in the schema.
        Expected specification and integrity are both 0.5, and so is the
        quality index (their mean).
        """
        # arrange
        records = [{"random_other": "other"}]
        definition = DataLoader.expand_schema([("random_int", "integer")],
                                              ["random_int"])

        # act
        metrics = self.inspector.inspect_attributes(records, definition)

        # assert
        expected_specification = (0 + 1) / 2
        expected_integrity = (1 + 0) / 2
        expected_quality = (expected_specification + expected_integrity) / 2
        self.assertEqual(expected_specification,
                         metrics.attribute_specification,
                         "Attribute specification is not correct")
        self.assertEqual(expected_integrity,
                         metrics.attribute_integrity,
                         "Attribute integrity is not correct")
        self.assertEqual(expected_quality,
                         metrics.attribute_quality_index,
                         "Attribute quality is not correct")
Beispiel #10
0
    def test_quality_with_complete_specification(self) -> None:
        """Check all metrics when both attributes have types and expectations.

        Expected metrics: integrity .75, specification 1.0, quality index
        .875.
        """
        # arrange
        records = [
            # random_string does not match
            {"random_int": 1, "random_string": "foo"},
            {"random_int": 2, "random_string": "bar"},
        ]
        attribute_types = [("random_string", "string"),
                           ("random_int", "number")]
        expectations = {
            "random_string": {"pattern": "bar"},
            "random_int": {"minimum": 0, "maximum": 100},
        }
        definition = DataLoader.expand_schema(attribute_types, [],
                                              expectations)

        # act
        metrics = self.inspector.inspect_attributes(records, definition)

        # assert
        self.assertEqual(0.75, metrics.attribute_integrity)
        self.assertEqual(1.0, metrics.attribute_specification)
        self.assertEqual(0.875, metrics.attribute_quality_index)
Beispiel #11
0
    def test_inspect_with_multiple_expectations_asyncapi_style_json(self):
        """Inspect five samples against the AsyncAPI-style expectation schema.

        The expected attribute integrity is 6/10 (to three decimal places).
        """
        # arrange
        schema = DataLoader.load_schema_with_name(
            "schema_expectation_asyncapi_style_json.json")

        value_pairs = [
            (1, 'id_1'),
            (2, 'foo'),  # no match (string)
            (3, 'id_3'),
            (4, 'id_4'),  # no match (integer)
            (5, 'foo'),  # no match (integer, string)
        ]
        records = [{
            'random_integer': number,
            'random_string': text
        } for number, text in value_pairs]

        # act
        definition = SchemaDefinition.create(schema, False)
        metrics = self.inspector.inspect(records, definition)

        # assert
        self.assertAlmostEqual(6 / 10, metrics.attribute_integrity, 3)
    def test_create_avro_parser(self):
        """The factory returns an AvroSchemaParser for the loaded schema."""
        # arrange
        raw_schema = DataLoader.load_schema()
        schema_definition = SchemaDefinition.create(raw_schema, False)

        # act
        created_parser = SchemaParserFactory.create(schema_definition)

        # assert
        self.assertIsInstance(created_parser, AvroSchemaParser)
Beispiel #13
0
    def test_integrity_with_wrong_type(self) -> None:
        """Overwrite the first sample's 'random_string' with an int.

        With the dummy samples and their schema, the expected attribute
        integrity is 0.5.
        """
        # arrange
        records, definition = DataLoader.create_dummy_samples()
        # noinspection PyTypeChecker
        records[0]['random_string'] = 123

        # act
        metrics = self.inspector.inspect_attributes(records, definition)

        # assert
        self.assertEqual(0.5, metrics.attribute_integrity)
Beispiel #14
0
    def test_integrity_with_missing_not_required(self) -> None:
        """A None value for a non-required integer keeps integrity at 1.0."""
        # arrange
        records = [{"random_int": value} for value in (1, None, 2)]
        definition = DataLoader.expand_schema([("random_int", "integer")], [])

        # act
        metrics = self.inspector.inspect_attributes(records, definition)

        # assert
        self.assertEqual(1.0, metrics.attribute_integrity)
Beispiel #15
0
    def test_specification_from_toeggelomat(self):
        """Inspect the toeggelomat samples against the toeggelomat schema.

        Expects 53 attribute entries, each with attribute specification and
        attribute integrity of 1.0.
        """
        # arrange
        samples = DataLoader.load_samples_from_file("samples_toeggelomat.json")

        # act
        schema = DataLoader.load_schema_with_name("schema_toeggelomat.json")
        result = self.inspector.inspect(samples,
                                        SchemaDefinition.create(schema, False))

        # assert
        attribute_details = result.attribute_details
        self.assertEqual(53, len(attribute_details),
                         "There should be 53 keys in the dictionary")
        # iterate key/value pairs directly instead of re-indexing by key
        for attribute_name, attribute_metric in attribute_details.items():
            self.assertEqual(
                1.0, attribute_metric.attribute_specification,
                f"Attribute specification must be 100% ({attribute_name})")
            self.assertEqual(
                1.0, attribute_metric.attribute_integrity,
                f"Attribute integrity must be 100% ({attribute_name})")
Beispiel #16
0
    def test_integrity_with_float_as_int(self) -> None:
        """A float-formatted string for an integer attribute gives integrity 0.0."""
        # arrange
        records = [{"random_int": "10000001.023"}]
        attribute_types = [("random_int", "integer")]
        definition = DataLoader.expand_schema(attribute_types, [])

        # act
        metrics = self.inspector.inspect_attributes(records, definition)

        # assert
        self.assertEqual(0.0, metrics.attribute_integrity)
Beispiel #17
0
    def test_integrity_without_provided_schema(self) -> None:
        """Inspect the dummy samples against an empty schema definition.

        Expected metrics: integrity 1.0, specification 0.0, quality index 0.5.
        """
        # arrange (the schema returned alongside the samples is not used)
        records, _unused_schema = DataLoader.create_dummy_samples()

        # act
        definition = SchemaDefinition.empty()
        metrics = self.inspector.inspect(records, definition)

        # assert
        self.assertEqual(1.0, metrics.attribute_integrity)
        self.assertEqual(0.0, metrics.attribute_specification)
        self.assertEqual(0.5, metrics.attribute_quality_index)
Beispiel #18
0
    def test_integrity_without_specified_required_field(self) -> None:
        """A required field that never appears in the samples halves integrity.

        'random_string' is required but the samples only carry 'random_int';
        the expected attribute integrity is .5.
        """
        # arrange
        records = [{"random_int": number} for number in range(1, 4)]
        attribute_types = [("random_int", "integer"),
                           ("random_string", "string")]
        required = ["random_string"]
        definition = DataLoader.expand_schema(attribute_types, required)

        # act
        metrics = self.inspector.inspect_attributes(records, definition)

        # assert
        self.assertEqual(0.5, metrics.attribute_integrity)
Beispiel #19
0
    def test_load_required_types_for_deeply_nested_schema(self):
        """Load required types from 'schema_registry_avro_complex.json'.

        The first element returned by the parser must list the nested
        attribute paths in the expected order.
        """
        # arrange
        raw_schema = DataLoader.load_schema_with_name(
            "schema_registry_avro_complex.json")
        parsed_schema = json.loads(raw_schema)
        expected_types = [
            "complex/subtypeString",
            "complex/subtypeComplex/subtypeNumber",
            "simpleNumber",
        ]

        # act
        loaded = self.parser.load_required_types_from_schema(parsed_schema)
        type_definitions, _ = loaded

        # assert
        self.assertListEqual(expected_types, type_definitions)
Beispiel #20
0
    def test_integrity_with_negative_as_string(self) -> None:
        """A negative number given as a string for an integer gives integrity 0."""
        # arrange
        records = [{"random_int": "-10000"}]
        attribute_types = [("random_int", "integer")]
        definition = DataLoader.expand_schema(attribute_types, [])

        # act
        metrics = self.inspector.inspect_attributes(records, definition)

        # assert
        failure_message = (
            "Attribute integrity must be 0% (even if not required, a "
            "specified value needs to be correct).")
        self.assertEqual(0.0, metrics.attribute_integrity, failure_message)
    def test_load_required_types_for_deeply_nested_schema(self):
        """Load required types from 'schema_inferred_complex.json'.

        Uses a fresh JsonSchemaParser; the first element it returns must
        list the nested attribute paths in the expected order.
        """
        # arrange
        schema = DataLoader.load_schema_with_name(
            "schema_inferred_complex.json")
        schema_obj = json.loads(schema)
        expected_types = [
            "base", "complex/type1number",
            "complex/type3complex/subtype1number"
        ]

        # act
        type_definitions, _ = JsonSchemaParser(
        ).load_required_types_from_schema(schema_obj)

        # assert (expected value first, like the other assertions here)
        self.assertListEqual(expected_types, type_definitions)
Beispiel #22
0
    def test_quality_without_specification(self):
        """With an empty schema the expected quality index is .5."""
        # arrange
        records = [
            {"random_int": 1, "random_string": "foo"},
            {"random_int": 2, "random_string": "bar"},
        ]
        definition = DataLoader.expand_schema([], [])

        # act
        metrics = self.inspector.inspect_attributes(records, definition)

        # assert
        self.assertEqual(0.5, metrics.attribute_quality_index)
Beispiel #23
0
    def test_inspect_with_unspecified_field(self):
        """Inspect a single sample against an empty schema.

        Expected metrics: specification 0, integrity 1, quality index .5.
        """
        # arrange
        records = [{"random_int": 1}]
        definition = DataLoader.expand_schema([], [])

        # act
        metrics = self.inspector.inspect_attributes(records, definition)

        # assert
        self.assertEqual(0, metrics.attribute_specification)
        self.assertEqual(1, metrics.attribute_integrity)
        self.assertEqual(0.5, metrics.attribute_quality_index)
Beispiel #24
0
    def test_specification_with_complete_specification(self) -> None:
        """Both attributes are typed and required, with no expectations.

        The expected attribute specification is .5.
        """
        # arrange
        records = [{
            "random_int": position,
            "random_string": text
        } for position, text in enumerate(("foo", "bar"), start=1)]

        attribute_types = [("random_int", "integer"),
                           ("random_string", "string")]
        required = ["random_string", "random_int"]
        definition = DataLoader.expand_schema(attribute_types, required)

        # act
        metrics = self.inspector.inspect_attributes(records, definition)

        # assert
        self.assertEqual(0.5, metrics.attribute_specification)
Beispiel #25
0
    def test_quality_with_partial_specification(self) -> None:
        """Both attributes are typed, with no required list or expectations.

        Expected metrics: integrity 1.0, specification .5, quality index .75.
        """
        # arrange
        records = [
            {"random_int": 1, "random_string": "foo"},
            {"random_int": 2, "random_string": "bar"},
        ]
        attribute_types = [("random_string", "string"), ("random_int", "int")]
        definition = DataLoader.expand_schema(attribute_types, [])

        # act
        metrics = self.inspector.inspect_attributes(records, definition)

        # assert
        self.assertEqual(1.0, metrics.attribute_integrity)
        self.assertEqual(0.5, metrics.attribute_specification)
        self.assertEqual(0.75, metrics.attribute_quality_index)
Beispiel #26
0
    def test_integrity_for_complex_type(self):
        """Check integrity for nested samples against 'schema_registry_avro.json'.

        Three samples are inspected; the expected integrity is 9/12 (to
        three decimal places).
        """
        # arrange
        schema = DataLoader.load_schema_with_name("schema_registry_avro.json")

        complete_sample = {
            "timestamp": 1595601702,
            "iss_position": {"longitude": "-42.2948", "latitude": "-40.3670"},
            "message": "success",
        }
        sample_without_longitude = {
            "timestamp": 1595601702,
            "iss_position": {"latitude": "-40.3670"},
            "message": "success",
        }
        sample_with_wrong_types = {
            "timestamp": "wrong",
            "iss_position": {"longitude": 666, "latitude": "-40.0283"},
            "message": "success",
        }
        records = [
            complete_sample,
            sample_without_longitude,
            sample_with_wrong_types,
        ]

        # act
        definition = SchemaDefinition.create(schema, False)
        metrics = self.inspector.inspect_attributes(records, definition)

        # assert - only message is not mandatory so 3 out of 12 (3*4) are
        # missing or wrong
        all_elements = 12
        invalid_elements = 3
        expected_integrity = (all_elements - invalid_elements) / all_elements
        self.assertAlmostEqual(
            expected_integrity, metrics.attribute_integrity, 3,
            f"Integrity must be {expected_integrity * 100}%")
Beispiel #27
0
    def test_specification_with_partial_specification(self) -> None:
        """Only 'random_string' is typed in the schema; 'random_int' is not.

        The expected attribute specification is .25.
        """
        # arrange
        records = [{
            "random_int": number,
            "random_string": text
        } for number, text in ((1, "foo"), (2, "bar"))]

        definition = DataLoader.expand_schema([("random_string", "string")],
                                              [])

        # act
        metrics = self.inspector.inspect_attributes(records, definition)

        # assert
        self.assertEqual(
            0.25, metrics.attribute_specification,
            "Specification must be 25% because only half of the data is specified in schema"
        )
Beispiel #28
0
    def test_specification_with_irrelevant_specification(self) -> None:
        """The schema only types 'random_other', which no sample contains.

        The expected attribute specification is 0.
        """
        # arrange
        records = [
            {"random_int": 1, "random_string": "foo"},
            {"random_int": 2, "random_string": "bar"},
        ]
        attribute_types = [("random_other", "string")]
        definition = DataLoader.expand_schema(attribute_types, [])

        # act
        metrics = self.inspector.inspect_attributes(records, definition)

        # assert
        self.assertEqual(
            0, metrics.attribute_specification,
            "Specification must be 0% because none of the attributes are specified"
        )
Beispiel #29
0
    def test_integrity_with_additional_field(self) -> None:
        """An extra field not in the schema leaves integrity at 1.0.

        The samples carry 'random_string' in addition to the typed
        'random_int'.
        """
        # arrange
        value_pairs = ((1, "abc"), (2, "efg"), (3, "hij"))
        records = [{
            "random_int": number,
            "random_string": text
        } for number, text in value_pairs]

        definition = DataLoader.expand_schema([("random_int", "integer")], [])

        # act
        metrics = self.inspector.inspect_attributes(records, definition)

        # assert
        self.assertEqual(1.0, metrics.attribute_integrity)
Beispiel #30
0
    def test_integrity_on_attribute_level_with_missing_value(self) -> None:
        """Check per-attribute integrity when some values have the wrong type.

        One of four 'random_int' values is a string and three of four
        'random_string' values are ints; the expected attribute integrity
        is 3/4 and 1/4 respectively (to three decimal places).
        """
        # arrange
        samples = [
            {
                "random_int": 1002,
                "random_string": 1
            },
            {
                "random_int": 1003,
                "random_string": 2
            },
            {
                "random_int": "foo",
                "random_string": 3
            },
            {
                "random_int": 1005,
                "random_string": "fourth"
            },
        ]

        schema_definition = DataLoader.expand_schema(
            [("random_int", "integer"), ("random_string", "string")], [])

        # act
        result = self.inspector.inspect_attributes(samples, schema_definition)

        # assert
        attribute_details = result.attribute_details
        # assertIn reports the searched container on failure, unlike assertTrue
        self.assertIn('random_int', attribute_details,
                      "Missing integrity for attribute random_int")
        self.assertIn('random_string', attribute_details,
                      "Missing integrity for attribute random_string")
        self.assertAlmostEqual(
            (3 / 4), attribute_details['random_int'].attribute_integrity, 3,
            "Integrity of random_int is not correct")
        self.assertAlmostEqual(
            (1 / 4), attribute_details['random_string'].attribute_integrity, 3,
            "Integrity of random_string is not correct")