Ejemplo n.º 1
0
    def test_compiler_stats_diff(self):
        """Exercise ``ColumnStatsProfileCompiler.diff`` with varied options.

        Covers default options, the categorical profile disabled in one
        compiler, then in both, and finally the order profile disabled too.
        """
        short_series = pd.Series(["1", "9", "9"])
        long_series = pd.Series(["10", "9", "9", "9"])
        options = StructuredOptions()
        stats_compiler_cls = col_pro_compilers.ColumnStatsProfileCompiler

        # Expected result pieces for the default-options diff.
        expected_statistics = {
            "unique_count": "unchanged",
            "unique_ratio": 0.16666666666666663,
            "categories": [["1"], ["9"], ["10"]],
            "gini_impurity": 0.06944444444444448,
            "unalikeability": 0.16666666666666663,
            "categorical_count": {
                "9": -1,
                "1": [1, None],
                "10": [None, 1],
            },
        }
        expected_chi2 = {
            "chi2-statistic": 2.1,
            "df": 2,
            "p-value": 0.3499377491111554,
        }
        full_diff = {
            "order": ["ascending", "descending"],
            "categorical": "unchanged",
            "statistics": expected_statistics,
            "chi2-test": expected_chi2,
        }

        # Default options on both sides.
        lhs = stats_compiler_cls(short_series)
        rhs = stats_compiler_cls(long_series)
        self.assertDictEqual(full_diff, lhs.diff(rhs))

        # Categorical profile disabled on the left-hand compiler only.
        options.category.is_enabled = False
        lhs = stats_compiler_cls(short_series, options)
        rhs = stats_compiler_cls(long_series)
        order_only_diff = {"order": ["ascending", "descending"]}
        self.assertDictEqual(order_only_diff, lhs.diff(rhs))

        # Categorical profile disabled on both compilers.
        lhs = stats_compiler_cls(short_series, options)
        rhs = stats_compiler_cls(long_series, options)
        order_only_diff = {"order": ["ascending", "descending"]}
        self.assertDictEqual(order_only_diff, lhs.diff(rhs))

        # Order disabled as well; the diff is expected to be empty.
        options.order.is_enabled = False
        lhs = stats_compiler_cls(short_series, options)
        rhs = stats_compiler_cls(long_series, options)
        self.assertDictEqual({}, lhs.diff(rhs))
Ejemplo n.º 2
0
    def test_compiler_stats_diff(self):
        """Diff column-stats compilers while profiles are switched off.

        The same two series are diffed four times: with default options,
        with the categorical profile disabled on one side, on both sides,
        and with the order profile disabled in addition.
        """
        series_a = pd.Series(['1', '9', '9'])
        series_b = pd.Series(['10', '9', '9', '9'])
        options = StructuredOptions()

        def diff_between(left_args, right_args):
            # Build both compilers from the given positional arguments and
            # return the diff of the left one against the right one.
            left = col_pro_compilers.ColumnStatsProfileCompiler(*left_args)
            right = col_pro_compilers.ColumnStatsProfileCompiler(*right_args)
            return left.diff(right)

        # Default options for both compilers.
        categorical_count_diff = {
            '9': -1,
            '1': [1, None],
            '10': [None, 1]
        }
        expected = {
            'order': ['ascending', 'descending'],
            'categorical': 'unchanged',
            'statistics': {
                'unique_count': 'unchanged',
                'unique_ratio': 0.16666666666666663,
                'categories': [['1'], ['9'], ['10']],
                'gini_impurity': 0.06944444444444448,
                'unalikeability': 0.16666666666666663,
                'categorical_count': categorical_count_diff
            },
            'chi2-test': {
                'chi2-statistic': 2.1,
                'df': 2,
                'p-value': 0.3499377491111554
            }
        }
        actual = diff_between((series_a,), (series_b,))
        self.assertDictEqual(expected, actual)

        # Categorical profile disabled in the first compiler only.
        options.category.is_enabled = False
        expected = {'order': ['ascending', 'descending']}
        actual = diff_between((series_a, options), (series_b,))
        self.assertDictEqual(expected, actual)

        # Categorical profile disabled in both compilers.
        expected = {'order': ['ascending', 'descending']}
        actual = diff_between((series_a, options), (series_b, options))
        self.assertDictEqual(expected, actual)

        # Order profile disabled too; an empty diff is expected.
        options.order.is_enabled = False
        expected = {}
        actual = diff_between((series_a, options), (series_b, options))
        self.assertDictEqual(expected, actual)
Ejemplo n.º 3
0
    def test_add_profilers(self):
        """Check ``+`` on compilers: rejected operands, then a merge.

        Walks through the type mismatch, name mismatch and profile-key
        mismatch errors before asserting the result of a valid addition.
        """
        base_left = col_pro_compilers.BaseCompiler(mock.Mock())
        base_right = col_pro_compilers.BaseCompiler(mock.Mock())

        # A non-compiler operand is expected to raise TypeError.
        int_mismatch_msg = (
            "`BaseCompiler` and `int` are "
            "not of the same profile compiler type."
        )
        with self.assertRaisesRegex(TypeError, int_mismatch_msg):
            base_left + 3

        # A compiler of another class is expected to raise TypeError.
        other_kind = col_pro_compilers.ColumnStatsProfileCompiler(mock.Mock())
        other_kind._profiles = [mock.Mock()]
        class_mismatch_msg = (
            "`BaseCompiler` and "
            "`ColumnStatsProfileCompiler` are "
            "not of the same profile compiler type."
        )
        with self.assertRaisesRegex(TypeError, class_mismatch_msg):
            base_left + other_kind

        # Differently named compilers are expected to raise ValueError.
        base_left.name = "compiler1"
        base_right.name = "compiler2"
        name_mismatch_msg = (
            "Column profile names are unmatched: "
            "compiler1 != compiler2"
        )
        with self.assertRaisesRegex(ValueError, name_mismatch_msg):
            base_left + base_right

        # Same name but different profile keys: ValueError expected.
        base_right.name = "compiler1"
        base_left._profiles = {"test1": mock.Mock()}
        base_right._profiles = {"test2": mock.Mock()}
        options_mismatch_msg = (
            "Column profilers were not setup with the "
            "same options, hence they do not calculate "
            "the same profiles and cannot be added "
            "together."
        )
        with self.assertRaisesRegex(ValueError, options_mismatch_msg):
            base_left + base_right

        # Matching name and keys: the merged profile under "test" should
        # be 3 (from 1 and 2) and the name should be kept.
        base_left._profiles = {"test": 1}
        base_right._profiles = {"test": 2}
        combined = base_left + base_right
        self.assertEqual(3, combined._profiles["test"])
        self.assertEqual("compiler1", combined.name)
Ejemplo n.º 4
0
    def test_add_profilers(self):
        """Verify addition of ``BaseColumnProfileCompiler`` objects.

        Each invalid combination must raise with the expected message;
        the final valid combination must produce the merged compiler.
        """
        compiler_cls = col_pro_compilers.BaseColumnProfileCompiler
        first = compiler_cls(mock.Mock())
        second = compiler_cls(mock.Mock())

        # Operand of an unrelated type.
        with self.assertRaisesRegex(
            TypeError,
            '`BaseColumnProfileCompiler` and `int` are '
            'not of the same profile compiler type.',
        ):
            first + 3

        # Operand of a different compiler class.
        stats = col_pro_compilers.ColumnStatsProfileCompiler(mock.Mock())
        stats._profiles = [mock.Mock()]
        with self.assertRaisesRegex(
            TypeError,
            '`BaseColumnProfileCompiler` and '
            '`ColumnStatsProfileCompiler` are '
            'not of the same profile compiler type.',
        ):
            first + stats

        # Compilers whose names differ.
        first.name, second.name = 'compiler1', 'compiler2'
        with self.assertRaisesRegex(
            ValueError,
            'Column profile names are unmatched: '
            'compiler1 != compiler2',
        ):
            first + second

        # Names now agree, but the profile keys do not.
        second.name = 'compiler1'
        first._profiles = {'test1': mock.Mock()}
        second._profiles = {'test2': mock.Mock()}
        with self.assertRaisesRegex(
            ValueError,
            'Column profilers were not setup with the '
            'same options, hence they do not calculate '
            'the same profiles and cannot be added '
            'together.',
        ):
            first + second

        # Names and keys agree: expect 1 + 2 under 'test' and the
        # shared name on the result.
        first._profiles = {'test': 1}
        second._profiles = {'test': 2}
        total = first + second
        self.assertEqual(3, total._profiles['test'])
        self.assertEqual('compiler1', total.name)
Ejemplo n.º 5
0
    def test_diff_primitive_compilers(self):
        """Check ``ColumnPrimitiveTypeProfileCompiler.diff`` results.

        Covers columns of different data types (text and datetime against
        int), columns of the same data type (where a ``statistics`` section
        is expected), and a diff against a different compiler class, which
        must raise ``TypeError``.
        """
        # Test different data types
        data1 = pd.Series(["-2", "-1", "1", "2"])
        data2 = pd.Series(["YO YO YO", "HELLO"])
        compiler1 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data1)
        compiler2 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data2)

        expected_diff = {
            "data_type_representation": {
                "datetime": "unchanged",
                "int": 1.0,
                "float": 1.0,
                "text": "unchanged",
            },
            "data_type": ["int", "text"],
        }
        self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

        # Test different data types with datetime specifically
        data1 = pd.Series(["-2", "-1", "1", "2"])
        data2 = pd.Series(["01/12/1967", "11/9/2024"])
        compiler1 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data1)
        compiler2 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data2)

        expected_diff = {
            "data_type_representation": {
                "datetime": -1.0,
                "int": 1.0,
                "float": 1.0,
                "text": "unchanged",
            },
            "data_type": ["int", "datetime"],
        }
        self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

        # Test same data types
        data1 = pd.Series(["-2", "15", "1", "2"])
        data2 = pd.Series(["5", "-1"])

        compiler1 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data1)
        compiler2 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data2)

        expected_diff = {
            "data_type_representation": {
                "datetime": "unchanged",
                "int": "unchanged",
                "float": "unchanged",
                "text": "unchanged",
            },
            "data_type": "unchanged",
            "statistics": {
                "min": -1.0,
                "max": 10.0,
                "sum": 12.0,
                "mean": 2.0,
                "median": -0.5,
                "mode": [[-2, 15, 1, 2], [], [5, -1]],
                "median_absolute_deviation": -1,
                "variance": 38.666666666666664,
                "stddev": 3.285085839971525,
                "t-test": {
                    "t-statistic": 0.4155260166386663,
                    "conservative": {
                        "df": 1,
                        "p-value": 0.749287157907667
                    },
                    "welch": {
                        "df": 3.6288111187629117,
                        "p-value": 0.7011367179395704
                    },
                },
            },
        }
        profile_diff = compiler1.diff(compiler2)

        # median, mode and median_absolute_deviation are compared
        # approximately and popped from both dicts, so the exact dict
        # comparison at the end only sees the remaining keys.
        self.assertAlmostEqual(
            expected_diff["statistics"].pop("median"),
            profile_diff["statistics"].pop("median"),
            places=2,
        )
        expected_diff_mode = expected_diff["statistics"].pop("mode")
        diff_mode = profile_diff["statistics"].pop("mode")
        # Check the lengths first: zip() stops at the shorter input, so a
        # missing or extra entry would otherwise go unnoticed.
        self.assertEqual(len(expected_diff_mode), len(diff_mode))
        for expected_modes, actual_modes in zip(expected_diff_mode, diff_mode):
            # Entries are sorted before comparison, so order inside each
            # entry does not matter.
            np.testing.assert_almost_equal(
                sorted(expected_modes), sorted(actual_modes), 2)
        self.assertAlmostEqual(
            expected_diff["statistics"].pop("median_absolute_deviation"),
            profile_diff["statistics"].pop("median_absolute_deviation"),
            places=2,
        )
        self.assertDictEqual(expected_diff, profile_diff)

        # Test different compilers
        data1 = pd.Series(["-2", "-1", "1", "2"])
        data2 = pd.Series(["5", "15"])

        compiler1 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data1)
        compiler2 = col_pro_compilers.ColumnStatsProfileCompiler(data2)
        # Assert type error is properly called
        with self.assertRaises(TypeError) as exc:
            compiler1.diff(compiler2)
        self.assertEqual(
            str(exc.exception),
            "`ColumnPrimitiveTypeProfileCompiler` and "
            "`ColumnStatsProfileCompiler` are not of the same "
            "profile compiler type.",
        )
Ejemplo n.º 6
0
    def test_diff_primitive_compilers(self):
        """Check diffs between primitive-type profile compilers.

        Exercises three data pairings (int vs. text, int vs. datetime,
        int vs. int) and finally a diff against a compiler of another
        class, for which a ``TypeError`` with a fixed message is expected.
        """
        # Test different data types
        data1 = pd.Series(['-2', '-1', '1', '2'])
        data2 = pd.Series(["YO YO YO", "HELLO"])
        compiler1 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data1)
        compiler2 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data2)

        expected_diff = {
            'data_type_representation': {
                'datetime': 'unchanged',
                'int': 1.0,
                'float': 1.0,
                'text': 'unchanged'
            },
            'data_type': ['int', 'text']
        }
        self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

        # Test different data types with datetime specifically
        data1 = pd.Series(['-2', '-1', '1', '2'])
        data2 = pd.Series(["01/12/1967", "11/9/2024"])
        compiler1 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data1)
        compiler2 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data2)

        expected_diff = {
            'data_type_representation': {
                'datetime': -1.0,
                'int': 1.0,
                'float': 1.0,
                'text': 'unchanged'
            },
            'data_type': ['int', 'datetime']
        }
        self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

        # Test same data types
        data1 = pd.Series(['-2', '15', '1', '2'])
        data2 = pd.Series(['5', '-1'])

        compiler1 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data1)
        compiler2 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data2)

        expected_diff = {
            'data_type_representation': {
                'datetime': 'unchanged',
                'int': 'unchanged',
                'float': 'unchanged',
                'text': 'unchanged'
            },
            'data_type': 'unchanged',
            'statistics': {
                'min': -1.0,
                'max': 10.0,
                'sum': 12.0,
                'mean': 2.0,
                'median': -0.5,
                'mode': [[-2, 15, 1, 2], [], [5, -1]],
                'median_absolute_deviation': -1,
                'variance': 38.666666666666664,
                'stddev': 3.285085839971525,
                't-test': {
                    't-statistic': 0.4155260166386663,
                    'conservative': {
                        'df': 1,
                        'p-value': 0.749287157907667
                    },
                    'welch': {
                        'df': 3.6288111187629117,
                        'p-value': 0.7011367179395704
                    }
                }
            }
        }
        profile_diff = compiler1.diff(compiler2)

        # The three statistics below are checked to two decimal places and
        # removed from both dicts; everything left is compared exactly.
        self.assertAlmostEqual(expected_diff['statistics'].pop('median'),
                               profile_diff['statistics'].pop('median'),
                               places=2)
        expected_diff_mode = expected_diff['statistics'].pop('mode')
        diff_mode = profile_diff['statistics'].pop('mode')
        # Guard the pairwise loop: zip() silently truncates to the shorter
        # sequence, so the entry counts must be asserted explicitly.
        self.assertEqual(len(expected_diff_mode), len(diff_mode))
        for expected_modes, actual_modes in zip(expected_diff_mode, diff_mode):
            # Sort both sides so ordering within an entry is irrelevant.
            np.testing.assert_almost_equal(sorted(expected_modes),
                                           sorted(actual_modes), 2)
        self.assertAlmostEqual(
            expected_diff['statistics'].pop('median_absolute_deviation'),
            profile_diff['statistics'].pop('median_absolute_deviation'),
            places=2)
        self.assertDictEqual(expected_diff, profile_diff)

        # Test different compilers
        data1 = pd.Series(['-2', '-1', '1', '2'])
        data2 = pd.Series(['5', '15'])

        compiler1 = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data1)
        compiler2 = col_pro_compilers.ColumnStatsProfileCompiler(data2)
        # Assert type error is properly called
        with self.assertRaises(TypeError) as exc:
            compiler1.diff(compiler2)
        self.assertEqual(
            str(exc.exception), "`ColumnPrimitiveTypeProfileCompiler` and "
            "`ColumnStatsProfileCompiler` are not of the same "
            "profile compiler type.")