def test_compiler_stats_diff(self):
    """Diff two ColumnStatsProfileCompiler objects under varying options.

    Walks through four setups: default options, the category profile
    disabled on one side only, disabled on both sides, and finally the
    order profile disabled as well.
    """
    first_series = pd.Series(["1", "9", "9"])
    second_series = pd.Series(["10", "9", "9", "9"])
    options = StructuredOptions()
    make_compiler = col_pro_compilers.ColumnStatsProfileCompiler

    # Default options on both sides.
    left = make_compiler(first_series)
    right = make_compiler(second_series)
    statistics_diff = {
        "unique_count": "unchanged",
        "unique_ratio": 0.16666666666666663,
        "categories": [["1"], ["9"], ["10"]],
        "gini_impurity": 0.06944444444444448,
        "unalikeability": 0.16666666666666663,
        "categorical_count": {"9": -1, "1": [1, None], "10": [None, 1]},
    }
    chi2_diff = {
        "chi2-statistic": 2.1,
        "df": 2,
        "p-value": 0.3499377491111554,
    }
    full_diff = {
        "order": ["ascending", "descending"],
        "categorical": "unchanged",
        "statistics": statistics_diff,
        "chi2-test": chi2_diff,
    }
    self.assertDictEqual(full_diff, left.diff(right))

    # Category profile switched off for the left compiler only.
    options.category.is_enabled = False
    left = make_compiler(first_series, options)
    right = make_compiler(second_series)
    order_only_diff = {"order": ["ascending", "descending"]}
    self.assertDictEqual(order_only_diff, left.diff(right))

    # Category profile switched off for both compilers.
    left = make_compiler(first_series, options)
    right = make_compiler(second_series, options)
    self.assertDictEqual(order_only_diff, left.diff(right))

    # Order profile switched off too: the diff is expected to be empty.
    options.order.is_enabled = False
    left = make_compiler(first_series, options)
    right = make_compiler(second_series, options)
    self.assertDictEqual({}, left.diff(right))
def test_compiler_stats_diff(self):
    """Exercise ColumnStatsProfileCompiler.diff as profiles are disabled.

    Scenarios, in order: nothing disabled, category disabled for one
    compiler, category disabled for both, then order disabled as well.
    """
    options = StructuredOptions()
    stats_cls = col_pro_compilers.ColumnStatsProfileCompiler
    short_col = pd.Series(['1', '9', '9'])
    long_col = pd.Series(['10', '9', '9', '9'])

    # Scenario 1: normal diff with default options.
    base = stats_cls(short_col)
    other = stats_cls(long_col)
    count_changes = {'9': -1, '1': [1, None], '10': [None, 1]}
    expected = {
        'order': ['ascending', 'descending'],
        'categorical': 'unchanged',
        'statistics': {
            'unique_count': 'unchanged',
            'unique_ratio': 0.16666666666666663,
            'categories': [['1'], ['9'], ['10']],
            'gini_impurity': 0.06944444444444448,
            'unalikeability': 0.16666666666666663,
            'categorical_count': count_changes,
        },
        'chi2-test': {
            'chi2-statistic': 2.1,
            'df': 2,
            'p-value': 0.3499377491111554,
        },
    }
    self.assertDictEqual(expected, base.diff(other))

    # Scenario 2: categorical profile disabled in one compiler.
    options.category.is_enabled = False
    base = stats_cls(short_col, options)
    other = stats_cls(long_col)
    expected = {'order': ['ascending', 'descending']}
    self.assertDictEqual(expected, base.diff(other))

    # Scenario 3: categorical profile disabled in both compilers.
    base = stats_cls(short_col, options)
    other = stats_cls(long_col, options)
    expected = {'order': ['ascending', 'descending']}
    self.assertDictEqual(expected, base.diff(other))

    # Scenario 4: everything disabled.
    options.order.is_enabled = False
    base = stats_cls(short_col, options)
    other = stats_cls(long_col, options)
    expected = {}
    self.assertDictEqual(expected, base.diff(other))
def test_add_profilers(self):
    """Check `+` on compilers: each rejection path, then a successful merge."""
    left = col_pro_compilers.BaseCompiler(mock.Mock())
    right = col_pro_compilers.BaseCompiler(mock.Mock())

    # A non-compiler operand is rejected.
    int_operand_error = (
        "`BaseCompiler` and `int` are "
        "not of the same profile compiler type."
    )
    with self.assertRaisesRegex(TypeError, int_operand_error):
        left + 3

    # A compiler of a different class is rejected as well.
    stats_compiler = col_pro_compilers.ColumnStatsProfileCompiler(mock.Mock())
    stats_compiler._profiles = [mock.Mock()]
    other_class_error = (
        "`BaseCompiler` and "
        "`ColumnStatsProfileCompiler` are "
        "not of the same profile compiler type."
    )
    with self.assertRaisesRegex(TypeError, other_class_error):
        left + stats_compiler

    # Compilers with different names cannot be added.
    left.name = "compiler1"
    right.name = "compiler2"
    name_error = (
        "Column profile names are unmatched: "
        "compiler1 != compiler2"
    )
    with self.assertRaisesRegex(ValueError, name_error):
        left + right

    # Same name, but the two sides hold different profile keys.
    right.name = "compiler1"
    left._profiles = {"test1": mock.Mock()}
    right._profiles = {"test2": mock.Mock()}
    options_error = (
        "Column profilers were not setup with the "
        "same options, hence they do not calculate "
        "the same profiles and cannot be added "
        "together."
    )
    with self.assertRaisesRegex(ValueError, options_error):
        left + right

    # Matching name and keys: 1 and 2 are expected to merge into 3.
    left._profiles = {"test": 1}
    right._profiles = {"test": 2}
    merged = left + right
    self.assertEqual(3, merged._profiles["test"])
    self.assertEqual("compiler1", merged.name)
def test_add_profilers(self):
    """Verify compiler addition: type, name and profile-key checks, then success."""
    base_cls = col_pro_compilers.BaseColumnProfileCompiler
    first = base_cls(mock.Mock())
    second = base_cls(mock.Mock())
    same_type_suffix = 'not of the same profile compiler type.'

    # test incorrect type: plain int operand
    with self.assertRaisesRegex(
        TypeError,
        '`BaseColumnProfileCompiler` and `int` are ' + same_type_suffix,
    ):
        first + 3

    # test incorrect type: compiler of another class
    third = col_pro_compilers.ColumnStatsProfileCompiler(mock.Mock())
    third._profiles = [mock.Mock()]
    with self.assertRaisesRegex(
        TypeError,
        '`BaseColumnProfileCompiler` and '
        '`ColumnStatsProfileCompiler` are ' + same_type_suffix,
    ):
        first + third

    # test mismatched names
    first.name = 'compiler1'
    second.name = 'compiler2'
    with self.assertRaisesRegex(
        ValueError,
        'Column profile names are unmatched: compiler1 != compiler2',
    ):
        first + second

    # test mismatched profiles due to options
    second.name = 'compiler1'
    first._profiles = {'test1': mock.Mock()}
    second._profiles = {'test2': mock.Mock()}
    with self.assertRaisesRegex(
        ValueError,
        'Column profilers were not setup with the '
        'same options, hence they do not calculate '
        'the same profiles and cannot be added '
        'together.',
    ):
        first + second

    # test success
    first._profiles = {'test': 1}
    second._profiles = {'test': 2}
    combined = first + second
    self.assertEqual(3, combined._profiles['test'])
    self.assertEqual('compiler1', combined.name)
def test_diff_primitive_compilers(self):
    """Diff ColumnPrimitiveTypeProfileCompiler objects across data types.

    Covers int vs text, int vs datetime, int vs int (including the
    statistics section), and a diff against a different compiler class,
    which is expected to raise TypeError.
    """
    primitive_cls = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler

    # Integer column against a text column.
    left = primitive_cls(pd.Series(["-2", "-1", "1", "2"]))
    right = primitive_cls(pd.Series(["YO YO YO", "HELLO"]))
    expected = {
        "data_type_representation": {
            "datetime": "unchanged",
            "int": 1.0,
            "float": 1.0,
            "text": "unchanged",
        },
        "data_type": ["int", "text"],
    }
    self.assertDictEqual(expected, left.diff(right))

    # Integer column against a datetime column.
    left = primitive_cls(pd.Series(["-2", "-1", "1", "2"]))
    right = primitive_cls(pd.Series(["01/12/1967", "11/9/2024"]))
    expected = {
        "data_type_representation": {
            "datetime": -1.0,
            "int": 1.0,
            "float": 1.0,
            "text": "unchanged",
        },
        "data_type": ["int", "datetime"],
    }
    self.assertDictEqual(expected, left.diff(right))

    # Two integer columns: the diff also carries a statistics section.
    left = primitive_cls(pd.Series(["-2", "15", "1", "2"]))
    right = primitive_cls(pd.Series(["5", "-1"]))
    t_test_diff = {
        "t-statistic": 0.4155260166386663,
        "conservative": {"df": 1, "p-value": 0.749287157907667},
        "welch": {
            "df": 3.6288111187629117,
            "p-value": 0.7011367179395704,
        },
    }
    expected = {
        "data_type_representation": {
            "datetime": "unchanged",
            "int": "unchanged",
            "float": "unchanged",
            "text": "unchanged",
        },
        "data_type": "unchanged",
        "statistics": {
            "min": -1.0,
            "max": 10.0,
            "sum": 12.0,
            "mean": 2.0,
            "median": -0.5,
            "mode": [[-2, 15, 1, 2], [], [5, -1]],
            "median_absolute_deviation": -1,
            "variance": 38.666666666666664,
            "stddev": 3.285085839971525,
            "t-test": t_test_diff,
        },
    }
    actual = left.diff(right)
    expected_stats = expected["statistics"]
    actual_stats = actual["statistics"]

    # median, mode and median_absolute_deviation are compared to two
    # decimal places and popped, so the exact dict comparison further
    # down only sees the remaining keys.
    expected_median = expected_stats.pop("median")
    actual_median = actual_stats.pop("median")
    self.assertAlmostEqual(expected_median, actual_median, places=2)

    expected_mode = expected_stats.pop("mode")
    actual_mode = actual_stats.pop("mode")
    for idx, expected_part in enumerate(expected_mode):
        # Each part is sorted first so element order does not matter.
        np.testing.assert_almost_equal(
            sorted(expected_part), sorted(actual_mode[idx]), 2
        )

    expected_mad = expected_stats.pop("median_absolute_deviation")
    actual_mad = actual_stats.pop("median_absolute_deviation")
    self.assertAlmostEqual(expected_mad, actual_mad, places=2)

    self.assertDictEqual(expected, actual)

    # Diffing against a different compiler class must raise TypeError.
    left = primitive_cls(pd.Series(["-2", "-1", "1", "2"]))
    right = col_pro_compilers.ColumnStatsProfileCompiler(
        pd.Series(["5", "15"])
    )
    with self.assertRaises(TypeError) as exc:
        left.diff(right)
    type_error_message = (
        "`ColumnPrimitiveTypeProfileCompiler` and "
        "`ColumnStatsProfileCompiler` are not of the same "
        "profile compiler type."
    )
    self.assertEqual(str(exc.exception), type_error_message)
def test_diff_primitive_compilers(self):
    """Check ColumnPrimitiveTypeProfileCompiler.diff for several column pairs.

    Scenarios: differing data types (text, then datetime), matching data
    types with a statistics diff, and mismatched compiler classes.
    """
    make_primitive = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler
    small_ints = ['-2', '-1', '1', '2']

    # Test different data types
    compiler_a = make_primitive(pd.Series(small_ints))
    compiler_b = make_primitive(pd.Series(["YO YO YO", "HELLO"]))
    representation = {
        'datetime': 'unchanged',
        'int': 1.0,
        'float': 1.0,
        'text': 'unchanged',
    }
    wanted = {
        'data_type_representation': representation,
        'data_type': ['int', 'text'],
    }
    self.assertDictEqual(wanted, compiler_a.diff(compiler_b))

    # Test different data types with datetime specifically
    compiler_a = make_primitive(pd.Series(small_ints))
    compiler_b = make_primitive(pd.Series(["01/12/1967", "11/9/2024"]))
    representation = {
        'datetime': -1.0,
        'int': 1.0,
        'float': 1.0,
        'text': 'unchanged',
    }
    wanted = {
        'data_type_representation': representation,
        'data_type': ['int', 'datetime'],
    }
    self.assertDictEqual(wanted, compiler_a.diff(compiler_b))

    # Test same data types
    compiler_a = make_primitive(pd.Series(['-2', '15', '1', '2']))
    compiler_b = make_primitive(pd.Series(['5', '-1']))
    representation = {
        'datetime': 'unchanged',
        'int': 'unchanged',
        'float': 'unchanged',
        'text': 'unchanged',
    }
    statistics = {
        'min': -1.0,
        'max': 10.0,
        'sum': 12.0,
        'mean': 2.0,
        'median': -0.5,
        'mode': [[-2, 15, 1, 2], [], [5, -1]],
        'median_absolute_deviation': -1,
        'variance': 38.666666666666664,
        'stddev': 3.285085839971525,
        't-test': {
            't-statistic': 0.4155260166386663,
            'conservative': {
                'df': 1,
                'p-value': 0.749287157907667,
            },
            'welch': {
                'df': 3.6288111187629117,
                'p-value': 0.7011367179395704,
            },
        },
    }
    wanted = {
        'data_type_representation': representation,
        'data_type': 'unchanged',
        'statistics': statistics,
    }
    got = compiler_a.diff(compiler_b)

    # The approximate checks pop their keys so that the final exact
    # comparison runs on what is left in both dicts.
    wanted_median = wanted['statistics'].pop('median')
    got_median = got['statistics'].pop('median')
    self.assertAlmostEqual(wanted_median, got_median, places=2)

    wanted_mode = wanted['statistics'].pop('mode')
    got_mode = got['statistics'].pop('mode')
    for position, wanted_group in enumerate(wanted_mode):
        got_group = got_mode[position]
        np.testing.assert_almost_equal(
            sorted(wanted_group), sorted(got_group), 2
        )

    wanted_mad = wanted['statistics'].pop('median_absolute_deviation')
    got_mad = got['statistics'].pop('median_absolute_deviation')
    self.assertAlmostEqual(wanted_mad, got_mad, places=2)

    self.assertDictEqual(wanted, got)

    # Test different compilers
    compiler_a = make_primitive(pd.Series(small_ints))
    compiler_b = col_pro_compilers.ColumnStatsProfileCompiler(
        pd.Series(['5', '15'])
    )

    # Assert type error is properly called
    with self.assertRaises(TypeError) as exc:
        compiler_a.diff(compiler_b)
    self.assertEqual(
        str(exc.exception),
        "`ColumnPrimitiveTypeProfileCompiler` and "
        "`ColumnStatsProfileCompiler` are not of the same "
        "profile compiler type.",
    )