def test_categorical_diff(self):
    """Compare ``CategoricalColumn.diff`` output with hand-built expectations.

    Two comparisons are made against the same baseline profile
    (four "y" and three "n"):

    1. a second series that adds a "maybe" category, where the full
       statistics diff and the chi2-test section are expected;
    2. a series of twelve distinct strings, where the expected diff
       reports ``"categorical": [True, False]`` and only the
       unique-count statistics.
    """
    baseline_series = pd.Series(["y", "y", "y", "y", "n", "n", "n"])
    profile = CategoricalColumn(baseline_series.name)
    profile.update(baseline_series)

    three_way_series = pd.Series(
        ["y", "maybe", "y", "y", "n", "n", "maybe"]
    )
    three_way_profile = CategoricalColumn(three_way_series.name)
    three_way_profile.update(three_way_series)

    expected_statistics = {
        "unique_count": -1,
        "unique_ratio": -0.14285714285714285,
        "categories": [[], ["y", "n"], ["maybe"]],
        "gini_impurity": -0.16326530612244894,
        "unalikeability": -0.19047619047619047,
        "categorical_count": {"y": 1, "n": 1, "maybe": [None, 2]},
    }
    # chi2-statistic: sum of (observed - expected)^2 / expected over every
    # category in each column.
    # df: number of categories minus one.
    # p-value: obtained from the chi2 CDF.
    expected_chi2 = {
        "chi2-statistic": 82 / 35,
        "df": 2,
        "p-value": 0.3099238764710244,
    }
    expected_diff = {
        "categorical": "unchanged",
        "statistics": expected_statistics,
        "chi2-test": expected_chi2,
    }
    self.assertDictEqual(expected_diff, profile.diff(three_way_profile))

    # Second scenario: only one of the two columns is expected to be
    # reported as categorical.
    distinct_series = pd.Series(
        [
            "THIS",
            "is",
            "not",
            "a",
            "categorical",
            "column",
            "for",
            "testing",
            "purposes",
            "Bada",
            "Bing",
            "Badaboom",
        ]
    )
    distinct_profile = CategoricalColumn(distinct_series.name)
    distinct_profile.update(distinct_series)

    expected_diff = {
        "categorical": [True, False],
        "statistics": {
            "unique_count": -10,
            "unique_ratio": -0.7142857142857143,
        },
    }
    self.assertDictEqual(expected_diff, profile.diff(distinct_profile))
def test_categorical_diff(self):
    """Verify the dictionary produced by ``CategoricalColumn.diff``.

    The baseline column (four 'y', three 'n') is diffed first against a
    column that also contains 'maybe', and then against a column of
    twelve distinct strings; each result is compared to a literal
    expected dictionary with ``assertDictEqual``.
    """

    def build_profile(values):
        # Wrap the values in a Series, then create and populate a
        # CategoricalColumn named after that Series.
        series = pd.Series(values)
        column = CategoricalColumn(series.name)
        column.update(series)
        return column

    profile = build_profile(["y", "y", "y", "y", "n", "n", "n"])
    with_maybe = build_profile(["y", "maybe", "y", "y", "n", "n", "maybe"])

    # How the 'chi2-test' values below were derived:
    #   chi2-statistic = sum((observed-expected)^2/expected) taken over
    #                    each category in each column
    #   df             = categories - 1
    #   p-value        = found through the chi2 CDF
    expected_diff = {
        'categorical': 'unchanged',
        'statistics': {
            'unique_count': -1,
            'unique_ratio': -0.14285714285714285,
            'categories': [[], ['y', 'n'], ['maybe']],
            'gini_impurity': -0.16326530612244894,
            'unalikeability': -0.19047619047619047,
            'categorical_count': {
                'y': 1,
                'n': 1,
                'maybe': [None, 2],
            },
        },
        'chi2-test': {
            'chi2-statistic': 82 / 35,
            'df': 2,
            'p-value': 0.3099238764710244,
        },
    }
    actual_diff = profile.diff(with_maybe)
    self.assertDictEqual(expected_diff, actual_diff)

    # Diff against a column where every value is different; the expected
    # result flags the pair as [True, False] for 'categorical'.
    all_distinct = build_profile([
        "THIS", "is", "not", "a", "categorical", "column",
        "for", "testing", "purposes", "Bada", "Bing", "Badaboom",
    ])
    expected_diff = {
        'categorical': [True, False],
        'statistics': {
            'unique_count': -10,
            'unique_ratio': -0.7142857142857143,
        },
    }
    actual_diff = profile.diff(all_distinct)
    self.assertDictEqual(expected_diff, actual_diff)