Ejemplos de CategoricalColumn en Python, ejemplos de dataprofiler.profilers.CategoricalColumn en Python

Ejemplo n.º 1

0

Mostrar archivo

    def test_mixed_categorical_col_integer_string(self):
        """Profile ``localeabbr``, whose labels mix digit and letter codes.

        Asserts the sample size and that the profiled categories equal the
        expected labels, ignoring order.
        """
        column = self.aws_dataset["localeabbr"].dropna()
        column_profile = CategoricalColumn(column.name)
        column_profile.update(column)

        # Every expected label, whitespace-separated and split into a set.
        expected_labels = set((
            "36 OR IL 41 51 13 21 WA "
            "11 CA 37 TX 10 SPE 34 32 35 "
            "23 NM NV 33 44 22 GR 15 MI "
            "43 FL TA KY SP SE AZ 42 NJ "
            "DC 77 50 NGR 31 DIF 61 45 "
            "NY MH ALT CH NSW MS 81 GP "
            "KU 14 53 64 AP 38 IRK CL "
            "TXG LUA ANT PA QC RS MO C "
            "MOW ENG ON CE TN PI VLG DL "
            "VL GE WP GO BS KEM MA BEL "
            "LB CU EC PB RIX B RJ VA "
            "7 SL BE 47 RM BIH SD OH "
            "PR M SN COR 63 E BD VI "
            "SAM BA WY 62 4 PER WKO KYA "
            "6 MN SA 8 CO IS RIS FS "
            "IN LIV IA 24 VIC 27 16 PK "
            "WB NH DAS CT CN BIR NVS MG "
            "3 PH TO 1 HE VGG BU AB "
            "NIZ 92 46 MZ FR"
        ).split())

        self.assertEqual(2120, column_profile.sample_size)
        six.assertCountEqual(
            self, expected_labels, column_profile.categories)

Ejemplo n.º 2

0

Mostrar archivo

Archivo: test_categorical_column_profile.py Proyecto: bballamudi/DataProfiler

 def test_true_categorical_report(self):
     """Check the report for a column of three labels over twelve samples.

     The series holds "a" x3, "b" x4 and "c" x5, so the report is expected
     to flag the column as categorical with 3 unique values and a unique
     ratio of 0.25.
     """
     df_categorical = pd.Series(["a"] * 3 + ["b"] * 4 + ["c"] * 5)
     profile = CategoricalColumn(df_categorical.name)
     profile.update(df_categorical)
     report = profile.profile

     # Use the TestCase's own assertCountEqual throughout instead of mixing
     # it with the six compatibility shim.
     self.assertCountEqual(['categorical', 'statistics', 'times'], report)
     self.assertTrue(report["categorical"])
     self.assertCountEqual(
         ['unique_count', 'unique_ratio', 'categories'],
         report['statistics'])
     self.assertEqual(3, report["statistics"]["unique_count"])
     self.assertEqual(0.25, report["statistics"]["unique_ratio"])
     self.assertCountEqual(["a", "b", "c"],
                           report["statistics"]["categories"])

Ejemplo n.º 3

0

Mostrar archivo

Archivo: test_categorical_column_profile.py Proyecto: bballamudi/DataProfiler

 def test_correct_categorical_model_string(self):
     """Profile the ``host`` column and check match, size and categories."""
     expected_hosts = {
         'groucho-oregon', 'groucho-us-east', 'groucho-singapore',
         'groucho-tokyo', 'groucho-sa', 'zeppo-norcal', 'groucho-norcal',
         'groucho-eu', 'groucho-sydney',
     }

     host_column = self.aws_dataset["host"].dropna()
     host_profile = CategoricalColumn(host_column.name)
     host_profile.update(host_column)

     self.assertEqual(1.0, host_profile.is_match)
     self.assertEqual(2997, host_profile.sample_size)
     six.assertCountEqual(self, expected_hosts, host_profile.categories)

Ejemplo n.º 4

0

Mostrar archivo

Archivo: test_categorical_column_profile.py Proyecto: bballamudi/DataProfiler

    def test_false_categorical_report(self):
        """Check the report for a column of twenty distinct string values."""
        distinct_values = pd.Series([str(number) for number in range(20)])
        column_profile = CategoricalColumn(distinct_values.name)
        column_profile.update(distinct_values)
        result = column_profile.profile

        # Top-level sections of the report.
        six.assertCountEqual(
            self, ['categorical', 'statistics', 'times'], result)
        self.assertFalse(result["categorical"])

        # Only the two uniqueness statistics are expected here.
        six.assertCountEqual(
            self, ['unique_count', 'unique_ratio'], result['statistics'])
        self.assertEqual(20, result["statistics"]["unique_count"])
        self.assertEqual(1.0, result["statistics"]["unique_ratio"])

Ejemplo n.º 5

0

Mostrar archivo

Archivo: test_categorical_column_profile.py Proyecto: JGSweets/data-profiler

    def test_false_categorical_report(self):
        """Compare the whole report of an all-distinct column to a literal."""
        distinct_values = pd.Series([str(number) for number in range(20)])
        column_profile = CategoricalColumn(distinct_values.name)
        column_profile.update(distinct_values)

        result = column_profile.profile
        # Timing data is only required to be present; it is removed before
        # the remaining report is compared.
        timings = result.pop("times", None)
        self.assertIsNotNone(timings)

        expected = {
            "categorical": False,
            "statistics": {
                "unique_count": 20,
                "unique_ratio": 1,
            },
        }
        self.assertEqual(result, expected)

Ejemplo n.º 6

0

Mostrar archivo

    def test_true_categorical_report(self):
        """Check the full report for a column of three labels, twelve samples.

        The series holds "a" x3, "b" x4 and "c" x5. Besides the uniqueness
        statistics, the category list, the per-category counts, the Gini
        impurity and the unalikeability are all verified.
        """
        df_categorical = pd.Series([
            "a", "a", "a",
            "b", "b", "b", "b",
            "c", "c", "c", "c", "c",
        ])
        profile = CategoricalColumn(df_categorical.name)
        profile.update(df_categorical)
        report = profile.profile

        # Timing data is only required to be present.
        self.assertIsNotNone(report.pop("times", None))
        expected_profile = dict(
            categorical=True,
            statistics=dict([
                ("unique_count", 3),
                ("unique_ratio", 0.25),
                ("categories", ["a", "b", "c"]),
                ("categorical_count", {
                    "a": 3,
                    "b": 4,
                    "c": 5
                }),
                ("gini_impurity", (27 / 144) + (32 / 144) + (35 / 144)),
                ("unalikeability", 2 * (12 + 15 + 20) / 132),
            ]),
        )

        # The category list is popped and compared order-insensitively
        # because its order sometimes changes.
        self.assertCountEqual(
            expected_profile["statistics"].pop("categories"),
            report["statistics"].pop("categories"),
        )
        # assertCountEqual on two dicts only compares their keys, so use
        # assertDictEqual to verify the per-category counts as well. Dict
        # equality does not depend on insertion order.
        self.assertDictEqual(
            expected_profile["statistics"].pop("categorical_count"),
            report["statistics"].pop("categorical_count"),
        )
        self.assertEqual(report, expected_profile)

Ejemplo n.º 7

0

Mostrar archivo

 def test_categorical_column_with_wrong_options(self):
     """Constructing with a non-CategoricalOptions ``options`` must raise.

     Expects a ValueError whose message matches the pattern below.
     """
     with self.assertRaisesRegex(
             ValueError,
             "CategoricalColumn parameter 'options' must"
             " be of type CategoricalOptions.",
     ):
         # The constructor call itself is what should raise; the instance
         # is never needed, so it is not bound to a name.
         CategoricalColumn("Categorical", options="wrong_data_type")

Ejemplo n.º 8

0

Mostrar archivo

    def test_report(self):
        """Check ``report()`` equals ``profile`` for both flag settings."""
        series = pd.Series([str(number) for number in range(20)])
        column_profile = CategoricalColumn(series.name)
        column_profile.update(series)

        baseline = column_profile.profile
        with_flag_off = column_profile.report(remove_disabled_flag=False)
        with_flag_on = column_profile.report(remove_disabled_flag=True)

        self.assertDictEqual(baseline, with_flag_off)
        self.assertDictEqual(baseline, with_flag_on)

Ejemplo n.º 9

0

Mostrar archivo

Archivo: test_categorical_column_profile.py Proyecto: JGSweets/data-profiler

    def test_top_k_categories_change(self):
        """Check the size of ``categorical_count`` under three top-k setups."""

        def reported_count_size(column_profile):
            # Number of entries in the report's per-category count mapping.
            statistics = column_profile.profile['statistics']
            return len(statistics['categorical_count'])

        opts = CategoricalOptions()

        # Case 1: top_k_categories is None.
        series = pd.Series(
            ["a", "a", "b", "c", "d", "e", "e", "e", "f", "g"])
        column_profile = CategoricalColumn(series.name, opts)
        column_profile.update(series)
        self.assertEqual(reported_count_size(column_profile), 7)

        # Case 2: top_k_categories is smaller than the number of categories.
        column_profile._top_k_categories = 6
        self.assertEqual(reported_count_size(column_profile), 6)

        # Case 3: top_k_categories is larger than the number of categories.
        opts.top_k_categories = 6
        series = pd.Series(["a", "a", "b", "c", "d"])
        column_profile = CategoricalColumn(series.name, opts)
        column_profile.update(series)
        self.assertEqual(reported_count_size(column_profile), 4)

Ejemplo n.º 10

0

Mostrar archivo

Archivo: test_categorical_column_profile.py Proyecto: bballamudi/DataProfiler

    def test_categorical_merge(self):
        """Merge two profiles with ``+`` and check categories and ratios."""
        first_series = pd.Series([
            "abcd", "aa", "abcd", "aa", "b", "4", "3", "2", "dfd", "2",
            np.nan,
        ])
        second_series = pd.Series([
            "1", "null", "ee", "NaN", "ff", "nan", "gg", "None", "aa",
            "b", "ee",
        ])

        # Labels listed in insertion order.
        all_labels = [
            'abcd', 'aa', 'b', '4', '3', '2', 'dfd', np.nan,
            '1', 'null', 'ee', 'NaN', 'ff', 'nan', 'gg', 'None',
        ]

        first = CategoricalColumn("Name")
        first.update(first_series)

        second = CategoricalColumn("Name")
        second.update(second_series)

        # First merge: first + second.
        merged = first + second
        self.assertCountEqual(all_labels, merged.categories)
        self.assertEqual(
            merged.sample_size, first.sample_size + second.sample_size)
        self.assertEqual(merged.is_match, False)

        # Second merge: fold the first profile in once more.
        merged = first + merged
        self.assertCountEqual(all_labels, merged.categories)
        self.assertEqual(merged.sample_size, 33)

        # Small sample size: not a match.
        self.assertEqual(merged.is_match, False)
        self.assertEqual(merged.unique_ratio, 16 / 33)

        # Large sample size: becomes a match.
        merged.sample_size = 1000
        self.assertEqual(merged.is_match, True)
        self.assertEqual(merged.unique_ratio, 16 / 1000)

Ejemplo n.º 11

0

Mostrar archivo

Archivo: test_categorical_column_profile.py Proyecto: bballamudi/DataProfiler

    def test_timeit_profile(self):
        """Check the ``times`` entry of the profile under a patched clock."""
        host_column = self.aws_dataset["host"].dropna()
        column_profile = CategoricalColumn(host_column.name)

        # Fake clock readings; popping from the end yields 1.0, 2.0, ...
        fake_times = list(map(float, range(17, 0, -1)))

        def fake_clock():
            return fake_times.pop()

        with patch('time.time', side_effect=fake_clock):
            # Before any update the timings mapping is empty.
            self.assertEqual(
                defaultdict(float), column_profile.profile['times'])

            # After the first update 1.0 is expected for 'categories',
            # and 2.0 after the second.
            for expected_elapsed in (1.0, 2.0):
                column_profile.update(host_column)
                expected_times = defaultdict(
                    float, {'categories': expected_elapsed})
                self.assertEqual(
                    expected_times, column_profile.profile['times'])

Ejemplo n.º 12

0

Mostrar archivo

    def test_categorical_merge(self):
        """Merge profiles with ``+`` and check counts, ratios and reports."""
        first_series = pd.Series([
            "abcd", "aa", "abcd", "aa", "b", "4", "3", "2", "dfd", "2",
            np.nan,
        ])
        second_series = pd.Series([
            "1", "null", "ee", "NaN", "ff", "nan", "gg", "None", "aa",
            "b", "ee",
        ])

        # Labels listed in insertion order.
        all_labels = [
            "abcd", "aa", "b", "4", "3", "2", "dfd", np.nan,
            "1", "null", "ee", "NaN", "ff", "nan", "gg", "None",
        ]

        first = CategoricalColumn("Name")
        first.update(first_series)

        first_counts = {
            "abcd": 2, "aa": 2, "b": 1, "4": 1,
            "3": 1, "2": 2, "dfd": 1, np.nan: 1,
        }
        self.assertDictEqual(first_counts, first._categories)

        second = CategoricalColumn("Name")
        second.update(second_series)

        # First merge: first + second.
        merged = first + second
        self.assertCountEqual(all_labels, merged.categories)
        self.assertEqual(
            merged.sample_size, first.sample_size + second.sample_size)
        self.assertEqual(merged.is_match, False)
        merged_counts = {
            "abcd": 2, "aa": 3, "b": 2, "4": 1,
            "3": 1, "2": 2, np.nan: 1, "dfd": 1,
            "1": 1, "ee": 2, "ff": 1, "gg": 1,
            "NaN": 1, "None": 1, "nan": 1, "null": 1,
        }
        self.assertDictEqual(merged_counts, merged._categories)

        result = merged.profile
        self.assertIsNotNone(result.pop("times", None))
        expected_result = {
            "categorical": False,
            "statistics": {"unique_count": 16, "unique_ratio": 16 / 22},
        }
        self.assertEqual(result, expected_result)

        # Second merge: fold the first profile in once more.
        merged = first + merged
        self.assertCountEqual(all_labels, merged.categories)
        self.assertEqual(merged.sample_size, 33)

        # Small sample size: not a match.
        self.assertEqual(merged.is_match, False)
        self.assertEqual(merged.unique_ratio, 16 / 33)

        result = merged.profile
        self.assertIsNotNone(result.pop("times", None))
        expected_result = {
            "categorical": False,
            "statistics": {"unique_count": 16, "unique_ratio": 16 / 33},
        }
        self.assertEqual(result, expected_result)

        # Large sample size: becomes a match.
        merged.sample_size = 1000
        self.assertEqual(merged.is_match, True)
        self.assertEqual(merged.unique_ratio, 16 / 1000)

        result = merged.profile
        self.assertIsNotNone(result.pop("times", None))
        # These three entries are removed and checked on their own.
        seen_labels = result["statistics"].pop("categories")
        seen_counts = result["statistics"].pop("categorical_count")
        seen_gini = result["statistics"].pop("gini_impurity")
        expected_result = {
            "categorical": True,
            "statistics": {
                "unique_count": 16,
                "unique_ratio": 16 / 1000,
                "unalikeability": 32907 / (1000000 - 1000),
            },
        }
        # One term per distinct count value: (number of labels with that
        # count) * p * (1 - p), with p = count / 1000.
        expected_gini = ((1 * ((5 / 1000) * (995 / 1000))) +
                         (2 * ((4 / 1000) * (996 / 1000))) +
                         (1 * ((3 / 1000) * (997 / 1000))) +
                         (5 * ((2 / 1000) * (998 / 1000))) +
                         (7 * ((1 / 1000) * (999 / 1000))))
        self.assertAlmostEqual(seen_gini, expected_gini)
        self.assertEqual(result, expected_result)
        self.assertCountEqual(
            seen_labels,
            [
                "abcd", "aa", "2", np.nan, "4", "b", "3", "dfd",
                "ee", "ff", "nan", "None", "1", "gg", "null", "NaN",
            ],
        )
        final_counts = {
            "aa": 5, "2": 4, "abcd": 4, "b": 3,
            np.nan: 2, "dfd": 2, "3": 2, "4": 2,
            "ee": 2, "null": 1, "ff": 1, "NaN": 1,
            "1": 1, "nan": 1, "gg": 1, "None": 1,
        }
        # NOTE: assertCountEqual iterates both dicts, so only their keys
        # are compared here, not the counts.
        self.assertCountEqual(seen_counts, final_counts)

Ejemplo n.º 13

0

Mostrar archivo

Archivo: test_categorical_column_profile.py Proyecto: JGSweets/data-profiler

    def test_gini_impurity(self):
        """Check ``gini_impurity`` for a two-class, one-class and empty column."""
        # Two classes: 4 x "y" and 3 x "n" out of 7 samples.
        df_categorical = pd.Series(["y", "y", "y", "y", "n", "n", "n"])
        profile = CategoricalColumn(df_categorical.name)
        profile.update(df_categorical)
        expected_val = ((4 / 7) * (3 / 7)) + ((4 / 7) * (3 / 7))
        self.assertAlmostEqual(profile.gini_impurity, expected_val)

        # A single class: impurity is expected to be 0.
        df_categorical = pd.Series(["y", "y", "y", "y", "y", "y", "y"])
        profile = CategoricalColumn(df_categorical.name)
        profile.update(df_categorical)
        expected_val = 0
        self.assertEqual(profile.gini_impurity, expected_val)

        # Empty column: impurity is expected to be None. The dtype is given
        # explicitly because the default dtype of an empty Series differs
        # between pandas versions (and older versions warn about it).
        df_categorical = pd.Series([], dtype=object)
        profile = CategoricalColumn(df_categorical.name)
        profile.update(df_categorical)
        self.assertEqual(profile.gini_impurity, None)

Ejemplo n.º 14

0

Mostrar archivo

    def test_categorical_diff(self):
        """Check ``diff`` against a categorical and a non-categorical column."""
        base_series = pd.Series(["y", "y", "y", "y", "n", "n", "n"])
        base_profile = CategoricalColumn(base_series.name)
        base_profile.update(base_series)

        other_series = pd.Series(
            ["y", "maybe", "y", "y", "n", "n", "maybe"])
        other_profile = CategoricalColumn(other_series.name)
        other_profile.update(other_series)

        # chi2-statistic: sum of (observed - expected)^2 / expected over
        #   each category in each column
        # df: number of categories - 1
        # p-value: obtained from the chi2 CDF
        chi2_section = {
            "chi2-statistic": 82 / 35,
            "df": 2,
            "p-value": 0.3099238764710244,
        }
        stats_section = {
            "unique_count": -1,
            "unique_ratio": -0.14285714285714285,
            "categories": [[], ["y", "n"], ["maybe"]],
            "gini_impurity": -0.16326530612244894,
            "unalikeability": -0.19047619047619047,
            "categorical_count": {"y": 1, "n": 1, "maybe": [None, 2]},
        }
        expected = {
            "categorical": "unchanged",
            "statistics": stats_section,
            "chi2-test": chi2_section,
        }
        self.assertDictEqual(expected, base_profile.diff(other_profile))

        # Now diff against a column that is not categorical.
        words = pd.Series([
            "THIS", "is", "not", "a", "categorical", "column",
            "for", "testing", "purposes", "Bada", "Bing", "Badaboom",
        ])
        words_profile = CategoricalColumn(words.name)
        words_profile.update(words)
        expected = {
            "categorical": [True, False],
            "statistics": {
                "unique_count": -10,
                "unique_ratio": -0.7142857142857143,
            },
        }
        self.assertDictEqual(expected, base_profile.diff(words_profile))

Ejemplo n.º 15

0

Mostrar archivo

    def test_mixed_categorical_col_integer_string(self):
        """Profile ``localeabbr``, whose labels mix digit and letter codes.

        Asserts the sample size and that the profiled categories equal the
        expected labels, ignoring order.
        """
        locale_column = self.aws_dataset["localeabbr"].dropna()
        locale_profile = CategoricalColumn(locale_column.name)
        locale_profile.update(locale_column)

        expected_labels = {
            "36", "OR", "IL", "41", "51", "13", "21", "WA",
            "11", "CA", "37", "TX", "10", "SPE", "34", "32", "35",
            "23", "NM", "NV", "33", "44", "22", "GR", "15", "MI",
            "43", "FL", "TA", "KY", "SP", "SE", "AZ", "42", "NJ",
            "DC", "77", "50", "NGR", "31", "DIF", "61", "45",
            "NY", "MH", "ALT", "CH", "NSW", "MS", "81", "GP",
            "KU", "14", "53", "64", "AP", "38", "IRK", "CL",
            "TXG", "LUA", "ANT", "PA", "QC", "RS", "MO", "C",
            "MOW", "ENG", "ON", "CE", "TN", "PI", "VLG", "DL",
            "VL", "GE", "WP", "GO", "BS", "KEM", "MA", "BEL",
            "LB", "CU", "EC", "PB", "RIX", "B", "RJ", "VA",
            "7", "SL", "BE", "47", "RM", "BIH", "SD", "OH",
            "PR", "M", "SN", "COR", "63", "E", "BD", "VI",
            "SAM", "BA", "WY", "62", "4", "PER", "WKO", "KYA",
            "6", "MN", "SA", "8", "CO", "IS", "RIS", "FS",
            "IN", "LIV", "IA", "24", "VIC", "27", "16", "PK",
            "WB", "NH", "DAS", "CT", "CN", "BIR", "NVS", "MG",
            "3", "PH", "TO", "1", "HE", "VGG", "BU", "AB",
            "NIZ", "92", "46", "MZ", "FR",
        }

        self.assertEqual(2120, locale_profile.sample_size)
        six.assertCountEqual(
            self, expected_labels, locale_profile.categories)

Ejemplo n.º 16

0

Mostrar archivo

Archivo: test_categorical_column_profile.py Proyecto: JGSweets/data-profiler

    def test_categorical_merge(self):
        """Add categorical profiles together and verify every merge stage."""
        left_data = pd.Series([
            "abcd", "aa", "abcd", "aa", "b", "4", "3", "2", "dfd", "2",
            np.nan,
        ])
        right_data = pd.Series([
            "1", "null", "ee", "NaN", "ff", "nan", "gg", "None", "aa",
            "b", "ee",
        ])

        # Category labels in insertion order.
        label_order = [
            "abcd", "aa", "b", "4", "3", "2", "dfd", np.nan,
            "1", "null", "ee", "NaN", "ff", "nan", "gg", "None",
        ]

        left = CategoricalColumn("Name")
        left.update(left_data)

        counts_after_left = {
            "abcd": 2,
            "aa": 2,
            "b": 1,
            "4": 1,
            "3": 1,
            "2": 2,
            "dfd": 1,
            np.nan: 1,
        }
        self.assertDictEqual(counts_after_left, left._categories)

        right = CategoricalColumn("Name")
        right.update(right_data)

        # Stage 1: left + right.
        total = left + right
        self.assertCountEqual(label_order, total.categories)
        self.assertEqual(total.sample_size,
                         left.sample_size + right.sample_size)
        self.assertEqual(total.is_match, False)
        counts_after_merge = {
            "abcd": 2,
            "aa": 3,
            "b": 2,
            "4": 1,
            "3": 1,
            "2": 2,
            np.nan: 1,
            "dfd": 1,
            "1": 1,
            "ee": 2,
            "ff": 1,
            "gg": 1,
            "NaN": 1,
            "None": 1,
            "nan": 1,
            "null": 1,
        }
        self.assertDictEqual(counts_after_merge, total._categories)

        summary = total.profile
        self.assertIsNotNone(summary.pop("times", None))
        wanted = {
            "categorical": False,
            "statistics": {
                "unique_count": 16,
                "unique_ratio": 16 / 22,
            },
        }
        self.assertEqual(summary, wanted)

        # Stage 2: add the left profile again.
        total = left + total
        self.assertCountEqual(label_order, total.categories)
        self.assertEqual(total.sample_size, 33)

        # With the small sample size the column is not a match.
        self.assertEqual(total.is_match, False)
        self.assertEqual(total.unique_ratio, 16 / 33)

        summary = total.profile
        self.assertIsNotNone(summary.pop("times", None))
        wanted = {
            "categorical": False,
            "statistics": {
                "unique_count": 16,
                "unique_ratio": 16 / 33,
            },
        }
        self.assertEqual(summary, wanted)

        # Stage 3: with a large sample size the column becomes a match.
        total.sample_size = 1000
        self.assertEqual(total.is_match, True)
        self.assertEqual(total.unique_ratio, 16 / 1000)

        summary = total.profile
        self.assertIsNotNone(summary.pop("times", None))
        got_labels = summary["statistics"].pop("categories")
        got_counts = summary["statistics"].pop("categorical_count")
        got_gini = summary["statistics"].pop("gini_impurity")
        wanted = {
            "categorical": True,
            "statistics": {
                "unique_count": 16,
                "unique_ratio": 16 / 1000,
                "unalikeability": 32907 / (1000000 - 1000),
            },
        }
        wanted_gini = (
            (1 * ((5 / 1000) * (995 / 1000)))
            + (2 * ((4 / 1000) * (996 / 1000)))
            + (1 * ((3 / 1000) * (997 / 1000)))
            + (5 * ((2 / 1000) * (998 / 1000)))
            + (7 * ((1 / 1000) * (999 / 1000)))
        )
        self.assertAlmostEqual(got_gini, wanted_gini)
        self.assertEqual(summary, wanted)
        self.assertCountEqual(got_labels, [
            "abcd", "aa", "2", np.nan, "4", "b", "3", "dfd",
            "ee", "ff", "nan", "None", "1", "gg", "null", "NaN",
        ])
        wanted_counts = {
            "aa": 5,
            "2": 4,
            "abcd": 4,
            "b": 3,
            np.nan: 2,
            "dfd": 2,
            "3": 2,
            "4": 2,
            "ee": 2,
            "null": 1,
            "ff": 1,
            "NaN": 1,
            "1": 1,
            "nan": 1,
            "gg": 1,
            "None": 1,
        }
        # NOTE: assertCountEqual iterates both dicts, so this compares
        # their keys only.
        self.assertCountEqual(got_counts, wanted_counts)

Ejemplo n.º 17

0

Mostrar archivo

Archivo: test_categorical_column_profile.py Proyecto: JGSweets/data-profiler

    def test_categorical_diff(self):
        """Diff a yes/no column against two other columns."""
        yes_no = pd.Series(["y", "y", "y", "y", "n", "n", "n"])
        reference = CategoricalColumn(yes_no.name)
        reference.update(yes_no)

        with_maybe = pd.Series(
            ["y", "maybe", "y", "y", "n", "n", "maybe"])
        candidate = CategoricalColumn(with_maybe.name)
        candidate.update(with_maybe)

        # chi2-statistic = sum((observed-expected)^2/expected) for each
        #   category in each column
        # df = categories - 1
        # p-value found through using chi2 CDF
        wanted_diff = {
            "categorical": "unchanged",
            "statistics": {
                "unique_count": -1,
                "unique_ratio": -0.14285714285714285,
                "categories": [[], ["y", "n"], ["maybe"]],
                "gini_impurity": -0.16326530612244894,
                "unalikeability": -0.19047619047619047,
                "categorical_count": {
                    "y": 1,
                    "n": 1,
                    "maybe": [None, 2],
                },
            },
            "chi2-test": {
                "chi2-statistic": 82 / 35,
                "df": 2,
                "p-value": 0.3099238764710244,
            },
        }
        self.assertDictEqual(wanted_diff, reference.diff(candidate))

        # Only the reference column is categorical in this comparison.
        free_text = pd.Series([
            "THIS", "is", "not", "a", "categorical", "column", "for",
            "testing", "purposes", "Bada", "Bing", "Badaboom",
        ])
        candidate = CategoricalColumn(free_text.name)
        candidate.update(free_text)
        wanted_diff = {
            "categorical": [True, False],
            "statistics": {
                "unique_count": -10,
                "unique_ratio": -0.7142857142857143,
            },
        }
        self.assertDictEqual(wanted_diff, reference.diff(candidate))

Ejemplo n.º 18

0

Mostrar archivo

Archivo: test_categorical_column_profile.py Proyecto: JGSweets/data-profiler

    def test_unalikeability(self):
        """Check ``unalikeability`` across several small columns.

        Covers identical values, all-distinct values, two and three classes,
        a single sample, and an empty column.
        """
        # All samples identical.
        df_categorical = pd.Series(["a", "a"])
        profile = CategoricalColumn(df_categorical.name)
        profile.update(df_categorical)
        self.assertEqual(profile.unalikeability, 0)

        # Every sample distinct.
        df_categorical = pd.Series(["a", "c", "b"])
        profile = CategoricalColumn(df_categorical.name)
        profile.update(df_categorical)
        self.assertEqual(profile.unalikeability, 1)

        # Two classes with three samples each.
        df_categorical = pd.Series(["a", "a", "a", "b", "b", "b"])
        profile = CategoricalColumn(df_categorical.name)
        profile.update(df_categorical)
        self.assertEqual(profile.unalikeability, 18 / 30)

        # Three classes: 5 x "a", 3 x "b", 2 x "c".
        df_categorical = pd.Series(
            ["a", "a", "b", "b", "b", "a", "c", "c", "a", "a"])
        profile = CategoricalColumn(df_categorical.name)
        profile.update(df_categorical)
        self.assertEqual(profile.unalikeability, 2 * (10 + 15 + 6) / 90)

        # A single sample.
        df_categorical = pd.Series(["a"])
        profile = CategoricalColumn(df_categorical.name)
        profile.update(df_categorical)
        self.assertEqual(0, profile.unalikeability)

        # Empty column: None is expected. The dtype is given explicitly
        # because the default dtype of an empty Series differs between
        # pandas versions (and older versions warn about it).
        df_categorical = pd.Series([], dtype=object)
        profile = CategoricalColumn(df_categorical.name)
        profile.update(df_categorical)
        self.assertEqual(None, profile.unalikeability)