def test_option_timing(self):
    """Per-metric timing accumulates across successive ``update`` calls."""
    raw = [2.0, 12.5, 'not a float', 6.0, 'not a float']
    df = pd.Series(raw).apply(str)

    options = FloatOptions()
    options.set({"min.is_enabled": False})
    profiler = FloatColumn(df.name, options=options)

    # Descending values so each mocked time.time() call pops a larger time.
    clock_values = [float(i) for i in range(100, 0, -1)]
    with mock.patch('time.time', side_effect=lambda: clock_values.pop()):
        # Before any update, the times dictionary must be empty.
        self.assertEqual(defaultdict(float), profiler.profile['times'])

        # After one update each enabled metric has recorded elapsed time.
        profiler.update(df)
        expected = defaultdict(float, {
            'max': 1.0,
            'sum': 1.0,
            'variance': 1.0,
            'precision': 1.0,
            'histogram_and_quantiles': 15.0,
        })
        self.assertEqual(expected, profiler.profile['times'])

        # A second update adds to (does not reset) each recorded time.
        profiler.update(df)
        expected = defaultdict(float, {
            'max': 2.0,
            'sum': 2.0,
            'variance': 2.0,
            'precision': 2.0,
            'histogram_and_quantiles': 30.0,
        })
        self.assertEqual(expected, profiler.profile['times'])
def test_profile_merge_with_different_options(self):
    """Merging profilers with mismatched options warns about, and
    disables, each metric enabled in only one profile, while metrics
    enabled in both are still merged.

    Fix: removed a dead trailing block that rebuilt ``profiler1`` under a
    comment about "precision to 0.1" but never set precision and made no
    assertions — leftover code that only obscured the test's intent.
    """
    # First profiler: max and min disabled.
    options = FloatOptions()
    options.max.is_enabled = False
    options.min.is_enabled = False
    options.histogram_and_quantiles.bin_count_or_method = None
    df = pd.Series([2, 4, 6, 8]).apply(str)
    profiler1 = FloatColumn("Float", options=options)
    profiler1.update(df)

    # Second profiler: min and precision disabled.
    options = FloatOptions()
    options.min.is_enabled = False
    options.precision.is_enabled = False
    options.histogram_and_quantiles.bin_count_or_method = None
    df2 = pd.Series([10, 15]).apply(str)
    profiler2 = FloatColumn("Float", options=options)
    profiler2.update(df2)

    # Merging must warn for each metric enabled in only one profile.
    with warnings.catch_warnings(record=True) as w:
        profiler3 = profiler1 + profiler2

    list_of_warning_messages = [str(warning.message) for warning in w]
    warning1 = ("precision is disabled because it is not enabled in "
                "both profiles.")
    warning2 = ("max is disabled because it is not enabled in both "
                "profiles.")
    self.assertIn(warning1, list_of_warning_messages)
    self.assertIn(warning2, list_of_warning_messages)

    # Metrics enabled in both profiles are still merged.
    profile = profiler3.profile
    self.assertEqual("doane", profiler3.histogram_selection)
    self.assertEqual(21.5, profile['variance'])
    self.assertEqual(45.0, profiler3.sum)

    # Metrics disabled in either profile are not calculated.
    self.assertIsNone(profiler3.max)
    self.assertIsNone(profiler3.min)
    self.assertEqual(None, profiler3.precision['min'])
    self.assertEqual(None, profiler3.precision['max'])
def test_setting_options(self, *mocks):
    """Options propagate correctly whether set via dotted-key ``set``
    calls or via direct attribute assignment."""
    options = ProfilerOptions()

    # Dotted-key set(): unqualified keys fan out to all matching options.
    options.set({
        "data_labeler.is_enabled": False,
        "min.is_enabled": False,
        "structured_options.data_labeler.data_labeler_dirpath": "test",
        "data_labeler.max_sample_size": 15,
    })

    structured = options.structured_options
    text_options = structured.text.properties
    float_options = structured.float.properties
    int_options = structured.int.properties
    data_labeler_options = structured.data_labeler.properties

    self.assertFalse(structured.data_labeler.is_enabled)
    self.assertFalse(text_options["min"].is_enabled)
    self.assertFalse(float_options["min"].is_enabled)
    self.assertFalse(int_options["min"].is_enabled)
    self.assertEqual(data_labeler_options["data_labeler_dirpath"], "test")
    self.assertEqual(data_labeler_options["max_sample_size"], 15)

    # Direct attribute assignment is reflected in the properties too.
    structured.data_labeler.max_sample_size = 12
    structured.text.histogram_and_quantiles.is_enabled = True
    structured.text.is_enabled = False

    text_options = structured.text.properties
    data_labeler_options = structured.data_labeler.properties
    self.assertEqual(data_labeler_options["max_sample_size"], 12)
    self.assertTrue(text_options["histogram_and_quantiles"].is_enabled)
    self.assertFalse(text_options["is_enabled"])

    # Direct attribute access after set(), including the "*" wildcard key.
    float_options = FloatOptions()
    float_options.set({
        "precision.is_enabled": False,
        "min.is_enabled": False,
        "*.is_enabled": False,
    })
    self.assertFalse(float_options.precision.is_enabled)
    self.assertFalse(float_options.min.is_enabled)
    self.assertFalse(float_options.is_enabled)
def test_custom_bin_count_merge(self):
    """A user-set histogram bin count survives a merge; mismatched bin
    counts raise a warning and the larger count wins."""
    options = FloatOptions()
    options.histogram_and_quantiles.bin_count_or_method = 10

    df = pd.Series([2.0, 'not a float', 6.0, 'not a float']).apply(str)
    profiler1 = FloatColumn("Float", options)
    profiler1.update(df)

    df2 = pd.Series([10.0, 'not a float', 15.0, 'not a float']).apply(str)
    profiler2 = FloatColumn("Float", options)
    profiler2.update(df2)

    # Identical user-set bin counts merge silently.
    with warnings.catch_warnings(record=True) as w:
        merge_profile = profiler1 + profiler2
        self.assertListEqual([], w)
        self.assertEqual(10, merge_profile.user_set_histogram_bin)

    # Differing bin counts warn, and the larger count is kept.
    profiler2.user_set_histogram_bin = 120
    with self.assertWarnsRegex(
            UserWarning,
            'User set histogram bin counts did not '
            'match. Choosing the larger bin count.'):
        merged_profile = profiler1 + profiler2
    self.assertEqual(120, merged_profile.user_set_histogram_bin)
def test_histogram_option_integration(self):
    """``bin_count_or_method`` accepts a method name, a list of method
    names, or an explicit bin count, and the count is honored on update."""
    # A single bin method name.
    options = FloatOptions()
    options.histogram_and_quantiles.bin_count_or_method = "sturges"
    num_profiler = FloatColumn(name="test", options=options)
    self.assertIsNone(num_profiler.histogram_selection)
    self.assertEqual(["sturges"], num_profiler.histogram_bin_method_names)

    # A list of bin method names.
    options.histogram_and_quantiles.bin_count_or_method = [
        "sturges", "doane"
    ]
    num_profiler = FloatColumn(name="test2", options=options)
    self.assertIsNone(num_profiler.histogram_selection)
    self.assertEqual(["sturges", "doane"],
                     num_profiler.histogram_bin_method_names)

    # An explicit bin count selects the 'custom' method.
    options.histogram_and_quantiles.bin_count_or_method = 100
    num_profiler = FloatColumn(name="test3", options=options)
    self.assertIsNone(num_profiler.histogram_selection)
    self.assertEqual(['custom'], num_profiler.histogram_bin_method_names)

    # With only one unique value, bin size collapses to 1.
    num_profiler.update(pd.Series(['1', '1']))
    self.assertEqual(
        1,
        len(num_profiler.histogram_methods['custom']['histogram']
            ['bin_counts']))

    # A streaming update adding a second unique value restores the
    # requested 100 bins.
    num_profiler.update(pd.Series(['2']))
    self.assertEqual(
        100, len(num_profiler._stored_histogram['histogram']['bin_counts']))
    histogram, _ = num_profiler._histogram_for_profile('custom')
    self.assertEqual(100, len(histogram['bin_counts']))
def test_option_precision(self):
    """``precision.is_enabled`` and ``precision.sample_ratio`` control
    whether precision is computed and on how many rows."""
    df = pd.Series([1.1, 2.2, 3.3, 4.4]).apply(str)

    # Precision disabled: no sample size is recorded.
    options = FloatOptions()
    options.set({"precision.is_enabled": False})
    profiler = FloatColumn(df.name, options=options)
    profiler.update(df)
    self.assertEqual(None, profiler.precision['sample_size'])

    # Precision enabled: all four rows are sampled.
    options = FloatOptions()
    options.set({"precision.is_enabled": True})
    profiler = FloatColumn(df.name, options=options)
    profiler.update(df)
    self.assertEqual(4, profiler.precision['sample_size'])

    # Precision enabled with a 0.5 sample ratio: half the rows sampled.
    options = FloatOptions()
    options.set({"precision.sample_ratio": 0.5})
    profiler = FloatColumn(df.name, options=options)
    profiler.update(df)
    self.assertEqual(2, profiler.precision['sample_size'])