Example #1
0
    def test_merge_histogram(self):
        """Check the histogram produced after merging values into a seeded one.

        The profiler is first updated with an empty series, then its stored
        histogram is overwritten by hand with counts ``[3, 2]`` over edges
        ``[1.0, 3.0, 5.0]``.  After ``_merge_histogram`` is called with
        ``[0.5, 1.0, 2.0, 5.0]``, the ``'sqrt'`` profile histogram is expected
        to hold counts ``[5, 2, 2]`` over edges ``[0.5, 2.0, 3.5, 5.0]``.
        """
        empty_series = pd.Series([], dtype=object)
        float_profile = FloatColumn(empty_series.name)
        float_profile.update(empty_series)

        # Seed the stored histogram directly so the merge starts from a
        # known state rather than from whatever update() computed.
        seeded = float_profile._stored_histogram['histogram']
        seeded['bin_counts'] = np.array([3, 2])
        seeded['bin_edges'] = np.array([1.0, 3.0, 5.0])

        new_values = [0.5, 1.0, 2.0, 5.0]
        float_profile._merge_histogram(new_values)

        # _histogram_for_profile returns a sequence; the histogram dict is
        # its first element.
        profile_result = float_profile._histogram_for_profile('sqrt')
        merged = profile_result[0]

        want_counts = [5, 2, 2]
        want_edges = [0.5, 2.0, 3.5, 5.0]
        got_counts = merged['bin_counts'].tolist()
        self.assertEqual(want_counts, got_counts)
        # Edges are compared as a multiset (order-insensitive).
        self.assertCountEqual(want_edges, merged['bin_edges'])
Example #2
0
    def test_histogram_option_integration(self):
        """Verify that histogram options are picked up by ``FloatColumn``.

        Three values of ``histogram_and_quantiles.bin_count_or_method`` are
        exercised: a single method name, a list of method names, and an
        integer bin count.  For each, a freshly constructed profiler must
        report ``histogram_selection`` as ``None`` and the expected
        ``histogram_bin_method_names``.  The integer-configured profiler is
        then updated twice and the number of histogram bins is checked after
        each update.
        """
        options = FloatOptions()

        # (profiler name, option value, expected bin method names).  The same
        # options object is mutated and reused for every construction.
        option_cases = [
            ("test", "sturges", ["sturges"]),
            ("test2", ["sturges", "doane"], ["sturges", "doane"]),
            ("test3", 100, ['custom']),
        ]
        for column_name, setting, expected_methods in option_cases:
            options.histogram_and_quantiles.bin_count_or_method = setting
            num_profiler = FloatColumn(name=column_name, options=options)
            self.assertIsNone(num_profiler.histogram_selection)
            self.assertEqual(expected_methods,
                             num_profiler.histogram_bin_method_names)

        # From here on num_profiler is the last one built above ("test3",
        # configured with an integer bin count of 100).

        # Only one unique value has been seen: expect a single bin.
        num_profiler.update(pd.Series(['1', '1']))
        custom_entry = num_profiler.histogram_methods['custom']
        self.assertEqual(1, len(custom_entry['histogram']['bin_counts']))

        # A second, streaming update introduces another unique value: the
        # stored histogram is expected to have the configured 100 bins.
        num_profiler.update(pd.Series(['2']))
        stored = num_profiler._stored_histogram['histogram']
        self.assertEqual(100, len(stored['bin_counts']))

        # The histogram reported for the 'custom' method must also have
        # 100 bins.
        custom_histogram, _ = num_profiler._histogram_for_profile('custom')
        self.assertEqual(100, len(custom_histogram['bin_counts']))