    def test_option_timing(self):
        data = [2.0, 12.5, 'not a float', 6.0, 'not a float']
        df = pd.Series(data).apply(str)

        options = IntOptions()
        options.set({"min.is_enabled": False})

        profiler = IntColumn(df.name, options=options)

        time_array = [float(i) for i in range(100, 0, -1)]
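        # successive calls to the mocked time.time() return values 1.0 apart,
        # so each timed metric below appears to take exactly one second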
        with mock.patch('time.time', side_effect=lambda: time_array.pop()):
            # Validate that the times dictionary is empty
            self.assertCountEqual(defaultdict(float),
                                  profiler.profile['times'])
            profiler.update(df)

            # Validate that the profiled times for the int column are as expected.
            profile = profiler.profile

            expected = defaultdict(float, {'max': 1.0, 'sum': 1.0, 'variance': 1.0,
                                           'histogram_and_quantiles': 1.0})
            self.assertCountEqual(expected, profile['times'])

            # Validate that the times accumulate as expected after a second update
            profiler.update(df)
            expected = defaultdict(float, {'max': 2.0, 'sum': 2.0, 'variance': 2.0,
                                           'histogram_and_quantiles': 2.0})
            self.assertCountEqual(expected, profiler.profile['times'])

    def test_profile_merge_bin_edges_indices(self):
        vals = [
            4948484949555554544949495054485054,
            4948484948485749515554495054485054,
            4948484948505251545552524952485054,
            4948484952485048485551524952485054,
            4948484948515550575556535154485054,
            4948484950545549485651495054485054,
            4948484954565649505449524950485054,
            49484849535456545155495054485054,
            4948484954515651515451495054485054,
            4948484957575651505156554954485054
        ]

        data = pd.Series(vals)
        data_1 = data[:5]
        data_2 = data[5:]

        options = IntOptions()

        options.set({"histogram_and_quantiles.is_enabled": True})

        profile_1 = IntColumn("Int", options=options)
        profile_2 = IntColumn("Int", options=options)

        profile_1.update(data_1)
        profile_2.update(data_2)

        # merging the two profiles should complete without raising
        profile_1 + profile_2

    def test_profile_merge_with_different_options(self):
        # Creating the first profiler with max and min disabled
        options = IntOptions()
        options.max.is_enabled = False
        options.min.is_enabled = False

        data = [2, 4, 6, 8]
        df = pd.Series(data).apply(str)
        profiler1 = IntColumn("Int", options=options)
        profiler1.update(df)
        profiler1.match_count = 0

        # Creating the second profiler with only min disabled
        options = IntOptions()
        options.min.is_enabled = False
        data2 = [10, 15]
        df2 = pd.Series(data2).apply(str)
        profiler2 = IntColumn("Int", options=options)
        profiler2.update(df2)

        # Assert a warning is raised when merging two profilers with different options
        with self.assertWarnsRegex(
                RuntimeWarning, "max is disabled because it is not enabled in"
                " both profiles."):
            profiler3 = profiler1 + profiler2

        # Assert that these features are still merged
        profile = profiler3.profile
        self.assertIsNotNone(profiler3.histogram_selection)
        self.assertIsNotNone(profile['variance'])
        self.assertIsNotNone(profiler3.sum)

        # Assert that these features are not calculated
        self.assertIsNone(profiler3.max)
        self.assertIsNone(profiler3.min)

Example #4
    def test_invalid_options_type(self, *mocks):
        # Test incorrect data labeler options
        options = ProfilerOptions()
        options.structured_options.data_labeler = IntOptions()
        with self.assertRaisesRegex(
                ValueError, r"data_labeler must be a\(n\) DataLabelerOptions."):
            profile = Profiler(self.data, profiler_options=options)
        # Test incorrect float options
        options = ProfilerOptions()
        options.structured_options.float = IntOptions()
        with self.assertRaisesRegex(ValueError,
                                    r"float must be a\(n\) FloatOptions."):
            profile = Profiler(self.data, profiler_options=options)

Example #5
    def test_option_timing(self):
        data = [2.0, 12.5, "not a float", 6.0, "not a float"]
        df = pd.Series(data).apply(str)

        options = IntOptions()
        options.set({"min.is_enabled": False})

        profiler = IntColumn(df.name, options=options)

        time_array = [float(i) for i in range(100, 0, -1)]
        with mock.patch("time.time", side_effect=lambda: time_array.pop()):
            # Validate that the times dictionary is empty
            self.assertCountEqual(defaultdict(float),
                                  profiler.profile["times"])
            profiler.update(df)

            # Validate that the profiled times for the int column are as expected.
            profile = profiler.profile

            expected = defaultdict(
                float,
                {
                    "max": 1.0,
                    "sum": 1.0,
                    "variance": 1.0,
                    "skewness": 1.0,
                    "kurtosis": 1.0,
                    "num_zeros": 1.0,
                    "num_negatives": 1.0,
                    "histogram_and_quantiles": 1.0,
                },
            )
            self.assertCountEqual(expected, profile["times"])

            # Validate that the times accumulate as expected after a second update
            profiler.update(df)
            expected = defaultdict(
                float,
                {
                    "max": 2.0,
                    "sum": 2.0,
                    "variance": 2.0,
                    "skewness": 2.0,
                    "kurtosis": 2.0,
                    "num_zeros": 2.0,
                    "num_negatives": 2.0,
                    "histogram_and_quantiles": 2.0,
                },
            )
            self.assertCountEqual(expected, profiler.profile["times"])

Example #6
    def test_invalid_options_type(self, *mocks):
        # Test incorrect data labeler options
        options = ProfilerOptions()
        options.structured_options.data_labeler = IntOptions()
        with self.assertRaisesRegex(
                ValueError, "DataLabelerColumn parameter 'options' must be of "
                "type DataLabelerOptions."):
            profile = Profiler(self.data, profiler_options=options)
        # Test incorrect float options
        options = ProfilerOptions()
        options.structured_options.float = IntOptions()
        with self.assertRaisesRegex(
                ValueError, "FloatColumn parameter 'options' must be of type "
                "FloatOptions."):
            profile = Profiler(self.data, profiler_options=options)

    def test_custom_bin_count_merge(self):

        options = IntOptions()
        options.histogram_and_quantiles.bin_count_or_method = 10

        data = [2, 'not an int', 6, 'not an int']
        df = pd.Series(data).apply(str)
        profiler1 = IntColumn("Int", options)
        profiler1.update(df)

        data2 = [10, 'not an int', 15, 'not an int']
        df2 = pd.Series(data2).apply(str)
        profiler2 = IntColumn("Int", options)
        profiler2.update(df2)

        # no warning should occur
        import warnings
        with warnings.catch_warnings(record=True) as w:
            merge_profile = profiler1 + profiler2
        self.assertListEqual([], w)
        self.assertEqual(10, merge_profile.user_set_histogram_bin)

        # make bin counts different and get warning
        profiler2.user_set_histogram_bin = 120
        with self.assertWarnsRegex(
                UserWarning, 'User set histogram bin counts did not '
                'match. Choosing the larger bin count.'):
            merged_profile = profiler1 + profiler2
        self.assertEqual(120, merged_profile.user_set_histogram_bin)

    def test_bias_correction_option(self):
        data = np.linspace(-5, 5, 11).tolist()
        df1 = pd.Series(data)

        data = np.linspace(-3, 2, 11).tolist()
        df2 = pd.Series(data)

        data = np.full((10, ), 1)
        df3 = pd.Series(data)

        # Disable bias correction
        options = IntOptions()
        options.bias_correction.is_enabled = False
        num_profiler = IntColumn(df1.name, options=options)
        num_profiler.update(df1.apply(str))
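        # With bias correction disabled, the reported statistics are the
        # population (biased) moments; for this sample that gives variance
        # m2 = 10, skewness m3 / m2**1.5 = 0, and excess kurtosis
        # m4 / m2**2 - 3 = 89/50 - 3.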
        self.assertAlmostEqual(10, num_profiler.variance)
        self.assertAlmostEqual(0, num_profiler.skewness)
        self.assertAlmostEqual(89 / 50 - 3, num_profiler.kurtosis)

        df2_ints = df2[df2 == df2.round()]
        num_profiler.update(df2.apply(str))
        df = pd.concat([df1, df2_ints])  # cumulative integer data seen so far
        self.assertAlmostEqual(2184 / 289, num_profiler.variance)
        self.assertAlmostEqual(165 * np.sqrt(3 / 182) / 182,
                               num_profiler.skewness)
        self.assertAlmostEqual(60769 / 28392 - 3, num_profiler.kurtosis)

        df3_ints = df3[df3 == df3.round()]
        num_profiler.update(df3.apply(str))
        df = pd.concat([df1, df2_ints, df3_ints])  # cumulative integer data seen so far
        self.assertAlmostEqual(3704 / 729, num_profiler.variance)
        self.assertAlmostEqual(-11315 / (926 * np.sqrt(926)),
                               num_profiler.skewness)
        self.assertAlmostEqual(5305359 / 1714952 - 3, num_profiler.kurtosis)

    def test_profiled_mode(self):
        # disabled mode
        df = pd.Series([1, 1, 1, 1, 1, 1, 1]).apply(str)
        options = IntOptions()
        options.mode.is_enabled = False
        profiler = IntColumn(df.name, options)
        profiler.update(df)
        self.assertListEqual([np.nan], profiler.mode)

        # same values
        df = pd.Series([1, 1, 1, 1, 1, 1, 1]).apply(str)
        profiler = IntColumn(df.name)
        profiler.update(df)
        self.assertListEqual([1], profiler.mode)

        # multiple modes
        df = pd.Series([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6]).apply(str)
        profiler = IntColumn(df.name)
        profiler.update(df)
        np.testing.assert_array_almost_equal([1, 2, 3, 4, 5],
                                             profiler.mode,
                                             decimal=2)

        # with different values
        df = pd.Series([1, 1, 1, 1, 2]).apply(str)
        profiler = IntColumn(df.name)
        profiler.update(df)
        np.testing.assert_array_almost_equal([1], profiler.mode, decimal=2)

        # with negative values
        df = pd.Series([-1, 1, 1, 1, 2, 2, 2])
        profiler = IntColumn(df.name)
        profiler.update(df)
        np.testing.assert_array_almost_equal([1, 2], profiler.mode, decimal=2)

        # all unique values
        df = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).apply(str)
        profiler = IntColumn(df.name)
        profiler.update(df)
        # By default, returns 5 of the possible modes
        np.testing.assert_array_almost_equal([1, 2, 3, 4, 5],
                                             profiler.mode,
                                             decimal=2)

        # Edge case where mode appears later in the dataset
        df = pd.Series([1, 2, 3, 4, 5, 6, 6]).apply(str)
        profiler = IntColumn(df.name)
        profiler.update(df)
        np.testing.assert_array_almost_equal([6], profiler.mode, decimal=2)

        df = pd.Series([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,
                        7]).apply(str)
        profiler = IntColumn(df.name)
        profiler.update(df)
        np.testing.assert_array_almost_equal([7], profiler.mode, decimal=2)

    def test_top_k_modes(self):
        # Default options
        options = IntOptions()
        df = pd.Series([1, 1, 2, 2, 3, 3, 4, 4, 5, 5]).apply(str)
        profiler = IntColumn(df.name, options)
        profiler.update(df)
        self.assertEqual(5, len(profiler.mode))

        # Test when top_k_modes is less than the number of modes
        options = IntOptions()
        options.mode.top_k_modes = 2
        df = pd.Series([1, 1, 2, 2, 3, 3, 4, 4, 5, 5]).apply(str)
        profiler = IntColumn(df.name, options)
        profiler.update(df)
        self.assertEqual(2, len(profiler.mode))

        # Test when top_k_modes is greater than the number of modes
        options = IntOptions()
        options.mode.top_k_modes = 8
        df = pd.Series([1, 1, 2, 2, 3, 3, 4, 4, 5, 5]).apply(str)
        profiler = IntColumn(df.name, options)
        profiler.update(df)
        # Only 5 possible modes so return 5
        self.assertEqual(5, len(profiler.mode))

    def test_bias_correction_merge(self):
        data = np.linspace(-5, 5, 11).tolist()
        df1 = pd.Series(data)

        data = np.linspace(-3, 2, 11).tolist()
        df2 = pd.Series(data)

        data = np.full((10, ), 1)
        df3 = pd.Series(data)

        # Disable bias correction
        options = IntOptions()
        options.bias_correction.is_enabled = False
        num_profiler1 = IntColumn(df1.name, options=options)
        num_profiler1.update(df1.apply(str))
        self.assertAlmostEqual(10, num_profiler1.variance)
        self.assertAlmostEqual(0, num_profiler1.skewness)
        self.assertAlmostEqual(89 / 50 - 3, num_profiler1.kurtosis)

        df2_ints = df2[df2 == df2.round()]
        num_profiler2 = IntColumn(df2.name)
        num_profiler2.update(df2.apply(str))
        num_profiler_merged = num_profiler1 + num_profiler2
        # Merged statistics should remain the biased (uncorrected) values
        self.assertFalse(num_profiler_merged.bias_correction)
        self.assertAlmostEqual(2184 / 289, num_profiler_merged.variance)
        self.assertAlmostEqual(165 * np.sqrt(3 / 182) / 182,
                               num_profiler_merged.skewness)
        self.assertAlmostEqual(60769 / 28392 - 3, num_profiler_merged.kurtosis)

        df3_ints = df3[df3 == df3.round()]
        num_profiler3 = IntColumn(df3.name)
        num_profiler3.update(df3.apply(str))
        num_profiler_merged = num_profiler1 + num_profiler2 + num_profiler3
        self.assertFalse(num_profiler_merged.bias_correction)
        self.assertAlmostEqual(3704 / 729, num_profiler_merged.variance)
        self.assertAlmostEqual(-11315 / (926 * np.sqrt(926)),
                               num_profiler_merged.skewness)
        self.assertAlmostEqual(5305359 / 1714952 - 3,
                               num_profiler_merged.kurtosis)

    def test_profiled_median(self):
        # disabled median
        df = pd.Series([1, 1, 1, 1, 1, 1, 1]).apply(str)
        options = IntOptions()
        options.median.is_enabled = False
        profiler = IntColumn(df.name, options)
        profiler.update(df)
        self.assertTrue(profiler.median is np.nan)

        # same values
        df = pd.Series([1, 1, 1, 1, 1, 1, 1]).apply(str)
        profiler = IntColumn(df.name)
        profiler.update(df)
        self.assertEqual(1, profiler.median)

        # median lies between two values
        df = pd.Series([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6]).apply(str)
        profiler = IntColumn(df.name)
        profiler.update(df)
        self.assertAlmostEqual(3.5, profiler.median, places=2)

        # with different values
        df = pd.Series([1, 1, 1, 1, 2]).apply(str)
        profiler = IntColumn(df.name)
        profiler.update(df)
        self.assertAlmostEqual(1, profiler.median, places=2)

        # with negative values
        df = pd.Series([-1, 1, 1, 1, 2, 2, 2])
        profiler = IntColumn(df.name)
        profiler.update(df)
        self.assertAlmostEqual(1, profiler.median, places=2)

        # all unique values
        df = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).apply(str)
        profiler = IntColumn(df.name)
        profiler.update(df)
        self.assertAlmostEqual(5.5, profiler.median, places=2)

    def test_histogram_option_integration(self):
        # test setting bin methods
        options = IntOptions()
        options.histogram_and_quantiles.bin_count_or_method = "sturges"
        num_profiler = IntColumn(name="test", options=options)
        self.assertIsNone(num_profiler.histogram_selection)
        self.assertEqual(["sturges"], num_profiler.histogram_bin_method_names)

        options.histogram_and_quantiles.bin_count_or_method = [
            "sturges", "doane"
        ]
        num_profiler = IntColumn(name="test2", options=options)
        self.assertIsNone(num_profiler.histogram_selection)
        self.assertEqual(["sturges", "doane"],
                         num_profiler.histogram_bin_method_names)

        options.histogram_and_quantiles.bin_count_or_method = 100
        num_profiler = IntColumn(name="test3", options=options)
        self.assertIsNone(num_profiler.histogram_selection)
        self.assertEqual(['custom'], num_profiler.histogram_bin_method_names)

        # case with just 1 unique value: the bin count should be set to 1
        num_profiler.update(pd.Series(['1', '1']))
        self.assertEqual(
            1,
            len(num_profiler.histogram_methods['custom']['histogram']
                ['bin_counts']))

        # case when more than 1 unique value, by virtue of a streaming update
        num_profiler.update(pd.Series(['2']))
        self.assertEqual(
            100,
            len(num_profiler._stored_histogram['histogram']['bin_counts']))

        histogram, _ = num_profiler._histogram_for_profile('custom')
        self.assertEqual(100, len(histogram['bin_counts']))