def test_merge_histogram(self):
    """Check the histogram produced by merging new values into a stored one.

    An empty FloatColumn has its stored histogram overwritten with two
    bins (counts [3, 2] over edges [1.0, 3.0, 5.0]). Four new values are
    then merged in, and the histogram returned for the 'sqrt' method is
    expected to have three bins spanning 0.5 to 5.0.
    """
    data = pd.Series([], dtype=object)
    profiler = FloatColumn(data.name)
    profiler.update(data)

    # Seed the stored histogram by hand so the merge starts from a known
    # state rather than from whatever update() computed for empty data.
    profiler._stored_histogram['histogram']['bin_counts'] = np.array([3, 2])
    profiler._stored_histogram['histogram']['bin_edges'] = np.array(
        [1.0, 3.0, 5.0])
    input_array = [0.5, 1.0, 2.0, 5.0]

    profiler._merge_histogram(input_array)
    merged_hist = profiler._histogram_for_profile('sqrt')[0]

    expected_bin_counts = [5, 2, 2]
    expected_bin_edges = [0.5, 2.0, 3.5, 5.0]
    self.assertEqual(
        expected_bin_counts, merged_hist['bin_counts'].tolist())
    # Bin edges are an ordered sequence, so compare position by position.
    # assertCountEqual only checks that the same elements occur the same
    # number of times and would accept the edges in any order.
    self.assertEqual(expected_bin_edges, list(merged_hist['bin_edges']))
def test_histogram_option_integration(self):
    """Check that FloatColumn honours `bin_count_or_method` from its options.

    Covers a single bin method name, a list of method names, and an
    integer bin count (which is reported under the 'custom' method name).
    """
    opts = FloatOptions()

    # test setting bin methods: (column name, option value, expected names)
    method_cases = (
        ("test", "sturges", ["sturges"]),
        ("test2", ["sturges", "doane"], ["sturges", "doane"]),
    )
    for column_name, setting, expected_names in method_cases:
        opts.histogram_and_quantiles.bin_count_or_method = setting
        column = FloatColumn(name=column_name, options=opts)
        self.assertIsNone(column.histogram_selection)
        self.assertEqual(expected_names, column.histogram_bin_method_names)

    # test histogram bin count set
    opts.histogram_and_quantiles.bin_count_or_method = 100
    counted_column = FloatColumn(name="test3", options=opts)
    self.assertIsNone(counted_column.histogram_selection)
    self.assertEqual(['custom'], counted_column.histogram_bin_method_names)

    # case when just 1 unique value, should just set bin size to be 1
    counted_column.update(pd.Series(['1', '1']))
    custom_method = counted_column.histogram_methods['custom']
    single_value_counts = custom_method['histogram']['bin_counts']
    self.assertEqual(1, len(single_value_counts))

    # case when more than 1 unique value, by virtue of a streaming update
    counted_column.update(pd.Series(['2']))
    stored_hist = counted_column._stored_histogram['histogram']
    self.assertEqual(100, len(stored_hist['bin_counts']))

    profile_hist, _ = counted_column._histogram_for_profile('custom')
    self.assertEqual(100, len(profile_hist['bin_counts']))