Beispiel #1
0
    def test_option_timing(self):
        """Check that per-statistic timings accumulate across updates.

        ``time.time`` is patched to hand out 1.0, 2.0, 3.0, ... on successive
        calls so the recorded durations are deterministic.  ``min`` is
        disabled, so no ``min`` entry is expected in the timings.
        """
        raw_values = [2.0, 12.5, 'not a float', 6.0, 'not a float']
        df = pd.Series(raw_values).apply(str)

        options = FloatOptions()
        options.set({"min.is_enabled": False})
        profiler = FloatColumn(df.name, options=options)

        # Popping from the end of a descending list yields ascending ticks.
        fake_clock = list(map(float, range(100, 0, -1)))
        with mock.patch('time.time', side_effect=lambda: fake_clock.pop()):
            # Nothing has been timed before the first update.
            self.assertEqual(defaultdict(float), profiler.profile['times'])

            profiler.update(df)

            # Timings recorded by the float column after one update.
            profile = profiler.profile
            after_first_update = defaultdict(float, {
                'max': 1.0,
                'sum': 1.0,
                'variance': 1.0,
                'precision': 1.0,
                'histogram_and_quantiles': 15.0,
            })
            self.assertEqual(after_first_update, profile['times'])

            # A second update with the same data doubles every timing.
            profiler.update(df)
            after_second_update = defaultdict(float, {
                'max': 2.0,
                'sum': 2.0,
                'variance': 2.0,
                'precision': 2.0,
                'histogram_and_quantiles': 30.0,
            })
            self.assertEqual(after_second_update, profiler.profile['times'])
Beispiel #2
0
    def test_profile_merge_with_different_options(self):
        """Merge two float profilers whose enabled options differ.

        The first profiler has ``max`` and ``min`` disabled; the second has
        ``min`` and ``precision`` disabled.  Adding them is expected to warn
        about ``precision`` and ``max`` (each enabled in only one profile),
        to leave those statistics uncomputed, and to still merge variance,
        sum and the histogram selection.
        """
        # First profiler: max and min disabled, everything else default.
        options = FloatOptions()
        options.max.is_enabled = False
        options.min.is_enabled = False
        options.histogram_and_quantiles.bin_count_or_method = None

        data = [2, 4, 6, 8]
        df = pd.Series(data).apply(str)
        profiler1 = FloatColumn("Float", options=options)
        profiler1.update(df)

        # Second profiler: min and precision disabled.
        options = FloatOptions()
        options.min.is_enabled = False
        options.precision.is_enabled = False
        options.histogram_and_quantiles.bin_count_or_method = None

        data2 = [10, 15]
        df2 = pd.Series(data2).apply(str)
        profiler2 = FloatColumn("Float", options=options)
        profiler2.update(df2)

        # Asserting warning when adding 2 profilers with different options
        with warnings.catch_warnings(record=True) as w:
            # Record every warning regardless of filters already in effect,
            # otherwise an earlier filter could hide the warnings we check.
            warnings.simplefilter("always")
            profiler3 = profiler1 + profiler2
        list_of_warning_messages = [str(warning.message) for warning in w]

        warning1 = ("precision is disabled because it is not enabled in "
                    "both profiles.")
        warning2 = ("max is disabled because it is not enabled in both "
                    "profiles.")
        self.assertIn(warning1, list_of_warning_messages)
        self.assertIn(warning2, list_of_warning_messages)

        # Assert that these features are still merged
        profile = profiler3.profile
        self.assertEqual("doane", profiler3.histogram_selection)
        self.assertEqual(21.5, profile['variance'])
        self.assertEqual(45.0, profiler3.sum)

        # Assert that these features are not calculated
        self.assertIsNone(profiler3.max)
        self.assertIsNone(profiler3.min)
        self.assertIsNone(profiler3.precision['min'])
        self.assertIsNone(profiler3.precision['max'])

        # Rebuild the first profiler with the same options as above.
        # NOTE(review): the original comment here read "precision to 0.1",
        # but nothing configures precision and nothing is asserted after
        # this setup -- confirm whether follow-up checks are missing.
        options = FloatOptions()
        options.max.is_enabled = False
        options.min.is_enabled = False
        # Use the same attribute name as the rest of this test; the previous
        # ``method`` assignment did not match the option set above.
        options.histogram_and_quantiles.bin_count_or_method = None

        data = [2, 4, 6, 8]
        df = pd.Series(data).apply(str)
        profiler1 = FloatColumn("Float", options=options)
        profiler1.update(df)
    def test_setting_options(self, *mocks):
        """Verify options can be changed via ``set`` and via attributes.

        Covers three paths: dotted-key ``set`` on ``ProfilerOptions``, direct
        attribute assignment on nested option objects, and attribute reads on
        a ``FloatOptions`` instance after ``set``.
        """
        profiler_options = ProfilerOptions()

        # Bulk assignment through a dotted-key dictionary
        profiler_options.set(
            {
                "data_labeler.is_enabled": False,
                "min.is_enabled": False,
                "structured_options.data_labeler.data_labeler_dirpath": "test",
                "data_labeler.max_sample_size": 15,
            }
        )

        structured = profiler_options.structured_options
        labeler_props = structured.data_labeler.properties

        self.assertFalse(structured.data_labeler.is_enabled)
        # "min.is_enabled" is expected to reach every column type checked here
        self.assertFalse(structured.text.properties["min"].is_enabled)
        self.assertFalse(structured.float.properties["min"].is_enabled)
        self.assertFalse(structured.int.properties["min"].is_enabled)
        self.assertEqual(labeler_props["data_labeler_dirpath"], "test")
        self.assertEqual(labeler_props["max_sample_size"], 15)

        # Direct attribute assignment on nested option objects
        structured.data_labeler.max_sample_size = 12
        structured.text.histogram_and_quantiles.is_enabled = True
        structured.text.is_enabled = False

        text_props = structured.text.properties
        labeler_props = structured.data_labeler.properties
        self.assertEqual(labeler_props["max_sample_size"], 12)
        self.assertTrue(text_props["histogram_and_quantiles"].is_enabled)
        self.assertFalse(text_props["is_enabled"])

        # Attribute reads on a standalone FloatOptions after ``set``
        standalone_float = FloatOptions()
        standalone_float.set(
            {
                "precision.is_enabled": False,
                "min.is_enabled": False,
                "*.is_enabled": False,
            }
        )

        self.assertFalse(standalone_float.precision.is_enabled)
        self.assertFalse(standalone_float.min.is_enabled)
        self.assertFalse(standalone_float.is_enabled)
Beispiel #4
0
    def test_custom_bin_count_merge(self):
        """Merge profilers that use a user-set histogram bin count.

        With matching bin counts the merge is expected to be silent and to
        keep the count; with differing counts a ``UserWarning`` is expected
        and the larger count is kept.
        """
        options = FloatOptions()
        options.histogram_and_quantiles.bin_count_or_method = 10

        first_values = [2.0, 'not a float', 6.0, 'not a float']
        profiler1 = FloatColumn("Float", options)
        profiler1.update(pd.Series(first_values).apply(str))

        second_values = [10.0, 'not a float', 15.0, 'not a float']
        profiler2 = FloatColumn("Float", options)
        profiler2.update(pd.Series(second_values).apply(str))

        # Same bin count on both sides: no warning should occur
        with warnings.catch_warnings(record=True) as caught:
            silent_merge = profiler1 + profiler2
        self.assertListEqual([], caught)
        self.assertEqual(10, silent_merge.user_set_histogram_bin)

        # Make the bin counts differ and expect a warning
        profiler2.user_set_histogram_bin = 120
        mismatch_message = ('User set histogram bin counts did not '
                            'match. Choosing the larger bin count.')
        with self.assertWarnsRegex(UserWarning, mismatch_message):
            noisy_merge = profiler1 + profiler2
        self.assertEqual(120, noisy_merge.user_set_histogram_bin)
Beispiel #5
0
    def test_histogram_option_integration(self):
        """Check how ``bin_count_or_method`` is reflected on a FloatColumn.

        A method name, a list of method names and an integer bin count are
        each applied to one shared options object; a fresh profiler is built
        after every change.  The integer-configured profiler is then updated
        to check the resulting number of histogram bins.
        """
        options = FloatOptions()

        # (column name, option value, expected histogram_bin_method_names)
        construction_cases = (
            ("test", "sturges", ["sturges"]),
            ("test2", ["sturges", "doane"], ["sturges", "doane"]),
            ("test3", 100, ['custom']),
        )
        for column_name, bin_setting, expected_names in construction_cases:
            options.histogram_and_quantiles.bin_count_or_method = bin_setting
            num_profiler = FloatColumn(name=column_name, options=options)
            self.assertIsNone(num_profiler.histogram_selection)
            self.assertEqual(expected_names,
                             num_profiler.histogram_bin_method_names)

        # ``num_profiler`` is now the integer-configured "test3" profiler.

        # case when just 1 unique value, should just set bin size to be 1
        num_profiler.update(pd.Series(['1', '1']))
        custom_histogram = num_profiler.histogram_methods['custom']['histogram']
        self.assertEqual(1, len(custom_histogram['bin_counts']))

        # case when more than 1 unique value, by virtue of a streaming update
        num_profiler.update(pd.Series(['2']))
        stored_histogram = num_profiler._stored_histogram['histogram']
        self.assertEqual(100, len(stored_histogram['bin_counts']))

        histogram, _ = num_profiler._histogram_for_profile('custom')
        self.assertEqual(100, len(histogram['bin_counts']))
Beispiel #6
0
    def test_option_precision(self):
        """Check ``precision['sample_size']`` under different precision options.

        Each case builds a fresh ``FloatOptions``/``FloatColumn`` pair, feeds
        it the same four values and compares the reported sample size.
        """
        df = pd.Series([1.1, 2.2, 3.3, 4.4]).apply(str)

        # (settings passed to ``options.set``, expected sample size)
        precision_cases = (
            # Turn off precision
            ({"precision.is_enabled": False}, None),
            # Turn on precision, check sample_size
            ({"precision.is_enabled": True}, 4),
            # Turn on precision, set 0.5 sample ratio
            ({"precision.sample_ratio": 0.5}, 2),
        )
        for settings, expected_size in precision_cases:
            options = FloatOptions()
            options.set(settings)
            profiler = FloatColumn(df.name, options=options)
            profiler.update(df)
            self.assertEqual(expected_size, profiler.precision['sample_size'])