Esempio n. 1
0
    def test_categorical_diff(self):
        """Compare a categorical profile against two other profiles via ``diff``.

        The first comparison expects ``"categorical": "unchanged"`` together
        with category statistics and a chi2-test section; the second expects
        ``"categorical": [True, False]`` and only the unique-value statistics.
        """
        base_series = pd.Series(["y", "y", "y", "y", "n", "n", "n"])
        profile = CategoricalColumn(base_series.name)
        profile.update(base_series)

        shifted_series = pd.Series(
            ["y", "maybe", "y", "y", "n", "n", "maybe"]
        )
        shifted_profile = CategoricalColumn(shifted_series.name)
        shifted_profile.update(shifted_series)

        # chi2-statistic = sum((observed-expected)^2/expected for each
        # category in each column)
        # df = categories - 1
        # p-value found through using chi2 CDF
        chi2_section = {
            "chi2-statistic": 82 / 35,
            "df": 2,
            "p-value": 0.3099238764710244,
        }
        count_changes = {"y": 1, "n": 1, "maybe": [None, 2]}
        statistics_section = {
            "unique_count": -1,
            "unique_ratio": -0.14285714285714285,
            "categories": [[], ["y", "n"], ["maybe"]],
            "gini_impurity": -0.16326530612244894,
            "unalikeability": -0.19047619047619047,
            "categorical_count": count_changes,
        }
        expected_diff = {
            "categorical": "unchanged",
            "statistics": statistics_section,
            "chi2-test": chi2_section,
        }

        actual_diff = profile.diff(shifted_profile)
        self.assertDictEqual(expected_diff, actual_diff)

        # Test with one categorical column matching
        free_text_values = [
            "THIS", "is", "not", "a", "categorical", "column",
            "for", "testing", "purposes", "Bada", "Bing", "Badaboom",
        ]
        free_text_series = pd.Series(free_text_values)
        free_text_profile = CategoricalColumn(free_text_series.name)
        free_text_profile.update(free_text_series)

        expected_diff = {
            "categorical": [True, False],
            "statistics": {
                "unique_count": -10,
                "unique_ratio": -0.7142857142857143,
            },
        }
        actual_diff = profile.diff(free_text_profile)
        self.assertDictEqual(expected_diff, actual_diff)
    def test_categorical_diff(self):
        """Verify the dictionary returned by ``diff`` for two comparisons.

        A base profile is diffed first against a profile whose expected
        result is ``'categorical': 'unchanged'`` (with a chi2-test entry), and
        then against a profile whose expected result is
        ``'categorical': [True, False]``.
        """

        def build_profile(values):
            # Wrap the values in a Series and feed it to a fresh column
            # profile whose name is taken from that Series.
            series = pd.Series(values)
            column = CategoricalColumn(series.name)
            column.update(series)
            return column

        profile = build_profile(["y", "y", "y", "y", "n", "n", "n"])
        profile2 = build_profile(["y", "maybe", "y", "y", "n", "n", "maybe"])

        # chi2-statistic = sum((observed-expected)^2/expected for each
        # category in each column)
        # df = categories - 1
        # p-value found through using chi2 CDF
        self.assertDictEqual(
            {
                'categorical': 'unchanged',
                'statistics': {
                    'unique_count': -1,
                    'unique_ratio': -0.14285714285714285,
                    'categories': [[], ['y', 'n'], ['maybe']],
                    'gini_impurity': -0.16326530612244894,
                    'unalikeability': -0.19047619047619047,
                    'categorical_count': {'y': 1, 'n': 1, 'maybe': [None, 2]},
                },
                'chi2-test': {
                    'chi2-statistic': 82 / 35,
                    'df': 2,
                    'p-value': 0.3099238764710244,
                },
            },
            profile.diff(profile2),
        )

        # Test with one categorical column matching
        profile2 = build_profile([
            "THIS", "is", "not", "a",
            "categorical", "column", "for", "testing",
            "purposes", "Bada", "Bing", "Badaboom",
        ])
        self.assertDictEqual(
            {
                'categorical': [True, False],
                'statistics': {
                    'unique_count': -10,
                    'unique_ratio': -0.7142857142857143,
                },
            },
            profile.diff(profile2),
        )