Ejemplo n.º 1
0
    def test_profiled_quantiles(self):
        """
        Checks the estimated quantiles of a profiled float column against
        fixed reference values.
        :return:
        """
        # four evenly spaced values covering a range of 3
        series = pd.Series(["1.0", "2.0", "3.0", "4.0"])
        column = FloatColumn(series.name)
        column.update(series)

        quantiles = column.profile['quantiles']
        # positions 249, 499 and 749 are read as Q1, Q2 and Q3
        q1, q2, q3 = (quantiles[position] for position in (249, 499, 749))

        self.assertEqual(999, len(quantiles))
        self.assertAlmostEqual(1.000012, quantiles[0])
        self.assertEqual(q1, 1.003)
        self.assertEqual(q2, 2.5)
        self.assertEqual(q3, 3.001)
        self.assertAlmostEqual(3.999988, quantiles[-1])
Ejemplo n.º 2
0
    def test_profiled_quantiles(self):
        """
        Checks that the profiled quartiles of a float column equal the
        percentiles numpy computes on the same values.
        :return:
        """
        # four evenly spaced values covering a range of 3
        raw_values = ["1.0", "2.0", "3.0", "4.0"]
        series = pd.Series(raw_values)
        column = FloatColumn(series.name)
        column.update(series)

        quantiles = column.profile['quantiles']
        # positions 249, 499 and 749 are read as Q1, Q2 and Q3
        estimated = [quantiles[position] for position in (249, 499, 749)]

        numeric_values = list(map(float, raw_values))
        exact = [np.percentile(numeric_values, percent)
                 for percent in (25, 50, 75)]

        for estimate, reference in zip(estimated, exact):
            self.assertEqual(estimate, reference)
Ejemplo n.º 3
0
    def test_profile_histogram_w_updates(self):
        """
        Checks that the histogram selection is cleared by an update or a
        merge and is set again once the profile is read, and that the
        reported histogram matches the expected bins each time.
        :return:
        """
        def check_histogram(expected, actual):
            # counts are compared in order, edges as rounded collections
            self.assertEqual(expected['bin_counts'].tolist(),
                             actual['bin_counts'].tolist())
            self.assertCountEqual(np.round(expected['bin_edges'], 12),
                                  np.round(actual['bin_edges'], 12))

        # first update: values span a range of 3, four bins of width 0.75
        first_batch = pd.Series(["1.0", "2.0", "3.0", "4.0"])
        first_expected = {
            'bin_edges': np.array([1.0, 1.75, 2.5, 3.25, 4.0]),
            'bin_counts': np.array([1, 1, 1, 1]),
        }

        # second update: combined with the first batch, so the range
        # grows from 3 to 12
        second_batch = pd.Series(["1.0", "5.0", "8.0", "13.0"])
        second_expected = {
            'bin_edges': np.array([1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0]),
            'bin_counts': np.array([4, 1, 1, 1, 0, 1]),
        }

        column = FloatColumn("test")
        for batch, expected in ((first_batch, first_expected),
                                (second_batch, second_expected)):
            column.update(batch)
            self.assertIsNone(column.histogram_selection)

            report = column.profile
            self.assertIsNotNone(column.histogram_selection)
            check_histogram(expected, report['histogram'])

        # same checks after merging the profile with itself
        merged_expected = {
            'bin_counts': np.array([6, 4, 2, 0, 2, 0, 2]),
            'bin_edges': np.array(
                [1., 19 / 7, 31 / 7, 43 / 7, 55 / 7, 67 / 7, 79 / 7, 13.]),
        }
        merged_column = column + column
        self.assertIsNone(merged_column.histogram_selection)

        merged_report = merged_column.profile
        self.assertIsNotNone(merged_column.histogram_selection)
        check_histogram(merged_expected, merged_report['histogram'])
Ejemplo n.º 4
0
    def test_custom_bin_count_merge(self):
        """
        Checks merging of profiles built with a user-set histogram bin count.

        Equal bin counts must merge without any recorded warning; differing
        counts must raise a UserWarning and keep the larger count.
        """
        options = FloatOptions()
        options.histogram_and_quantiles.bin_count_or_method = 10

        def build_profiler(values):
            # both profilers share the same options object
            series = pd.Series(values).apply(str)
            column = FloatColumn("Float", options)
            column.update(series)
            return column

        profiler1 = build_profiler([2.0, 'not a float', 6.0, 'not a float'])
        profiler2 = build_profiler([10.0, 'not a float', 15.0, 'not a float'])

        # identical bin counts: nothing should be recorded
        with warnings.catch_warnings(record=True) as recorded:
            merge_profile = profiler1 + profiler2
        self.assertListEqual([], recorded)
        self.assertEqual(10, merge_profile.user_set_histogram_bin)

        # mismatching bin counts: a warning is expected and the larger wins
        profiler2.user_set_histogram_bin = 120
        mismatch_message = ('User set histogram bin counts did not '
                            'match. Choosing the larger bin count.')
        with self.assertWarnsRegex(UserWarning, mismatch_message):
            merged_profile = profiler1 + profiler2
        self.assertEqual(120, merged_profile.user_set_histogram_bin)
Ejemplo n.º 5
0
    def test_base_case(self):
        """Checks the statistics reported after profiling an empty column."""
        empty = pd.Series([], dtype=object)
        column = FloatColumn(empty.name)
        column.update(empty)

        expected_attributes = (
            ('match_count', 0),
            ('min', None),
            ('max', None),
            ('mean', 0),
            ('variance', 0),
        )
        for attribute, expected in expected_attributes:
            self.assertEqual(getattr(column, attribute), expected)

        # stddev must be the np.nan object itself, not merely a NaN value
        self.assertIs(column.stddev, np.nan)
        self.assertIsNone(column.histogram_selection)
        self.assertEqual(len(column.quantiles), 1000)
        self.assertIsNone(column.data_type_ratio)
Ejemplo n.º 6
0
    def test_null_values_for_histogram(self):
        """
        Checks the histogram reported for columns containing infinite values.

        With only infinities the bin counts and edges are expected to be
        None; with finite values mixed in, the expected bins cover only the
        finite range.
        """
        def histogram_of(values):
            series = pd.Series(values)
            column = FloatColumn(series.name)
            column.update(series)
            return column.profile['histogram']

        # infinities only
        infinite_only = histogram_of(['-inf', 'inf'])
        self.assertEqual(infinite_only['bin_counts'], None)
        self.assertEqual(infinite_only['bin_edges'], None)

        # finite values together with infinities
        mixed = histogram_of(['-2', '-1', '1', '2', '-inf', 'inf'])
        expected_counts = np.array([1, 1, 0, 2])
        expected_edges = np.array([-2., -1., 0., 1., 2.])

        self.assertCountEqual(mixed['bin_counts'], expected_counts)
        self.assertCountEqual(mixed['bin_edges'], expected_edges)
Ejemplo n.º 7
0
    def test_histogram_loss(self):
        """
        Checks FloatColumn._histogram_loss for a small and a large run time.

        Arguments are passed positionally in the order: diff_var,
        avg_diffvar, total_var, avg_totalvar, run_time, avg_runtime.
        """
        scenarios = (
            # run time is small: the expected loss has no run-time term
            ((0.3, 0.2, 0.1, 0.05, 0.0014, 0.0022),
             0.1 / 0.2 + 0.05 / 0.05),
            # run time is big: the expected loss gains an 8 / 14 term
            ((0.3, 0.2, 0.1, 0.05, 22, 14),
             0.1 / 0.2 + 0.05 / 0.05 + 8 / 14),
        )
        for arguments, expected_loss in scenarios:
            est_loss = FloatColumn._histogram_loss(*arguments)
            self.assertEqual(expected_loss, est_loss)
Ejemplo n.º 8
0
    def test_profile_merge_no_bin_overlap(self):
        """
        Checks that adding two profiles whose histogram bin method names
        share nothing raises a ValueError.
        """
        def build_profiler(values):
            series = pd.Series(values).apply(str)
            column = FloatColumn("Float")
            column.update(series)
            return column

        profiler1 = build_profiler([2.0, 'not a float', 6.0, 'not a float'])
        profiler2 = build_profiler([10.0, 'not a float', 15.0, 'not a float'])

        # give each profile a distinct method name so nothing overlaps
        profiler1.histogram_bin_method_names = ['No overlap 1']
        profiler2.histogram_bin_method_names = ['No overlap 2']

        expected_message = ('Profiles have no overlapping bin methods '
                            'and therefore cannot be added together.')
        with self.assertRaisesRegex(ValueError, expected_message):
            profiler1 + profiler2
Ejemplo n.º 9
0
    def test_option_timing(self):
        """
        Checks the recorded timings when the `min` statistic is disabled.

        `time.time` is patched with a fake clock that returns 1.0, 2.0,
        3.0, ... on successive calls.
        """
        series = pd.Series(
            [2.0, 12.5, 'not a float', 6.0, 'not a float']).apply(str)

        options = FloatOptions()
        options.set({"min.is_enabled": False})
        column = FloatColumn(series.name, options=options)

        # popping from the end of a descending list yields ascending ticks
        ticks = list(map(float, range(100, 0, -1)))

        def fake_time():
            return ticks.pop()

        with mock.patch('time.time', side_effect=fake_time):
            # no timings are recorded before the first update
            self.assertEqual(defaultdict(float), column.profile['times'])
            column.update(series)

            # timings after the first update; there is no 'min' entry
            first_profile = column.profile
            after_first_update = defaultdict(float, {
                'max': 1.0,
                'sum': 1.0,
                'variance': 1.0,
                'precision': 1.0,
                'histogram_and_quantiles': 15.0,
            })
            self.assertEqual(after_first_update, first_profile['times'])

            # timings accumulate over a second update
            column.update(series)
            after_second_update = defaultdict(float, {
                'max': 2.0,
                'sum': 2.0,
                'variance': 2.0,
                'precision': 2.0,
                'histogram_and_quantiles': 30.0,
            })
            self.assertEqual(after_second_update, column.profile['times'])
Ejemplo n.º 10
0
    def test_merge_histogram(self):
        """
        Checks that merging new values into a hand-seeded stored histogram
        yields the expected 'sqrt' histogram.
        """
        empty = pd.Series([], dtype=object)
        column = FloatColumn(empty.name)
        column.update(empty)

        # seed the stored histogram directly instead of profiling data
        stored = column._stored_histogram['histogram']
        stored['bin_counts'] = np.array([3, 2])
        stored['bin_edges'] = np.array([1.0, 3.0, 5.0])

        column._merge_histogram([0.5, 1.0, 2.0, 5.0])
        merged_hist = column._histogram_for_profile('sqrt')[0]

        expected_bin_counts = [5, 2, 2]
        expected_bin_edges = [0.5, 2.0, 3.5, 5.0]
        self.assertEqual(expected_bin_counts,
                         merged_hist['bin_counts'].tolist())
        self.assertCountEqual(expected_bin_edges, merged_hist['bin_edges'])
Ejemplo n.º 11
0
 def test_histogram_to_array(self):
     """
     Checks the array rebuilt from a hand-seeded stored histogram.
     """
     empty = pd.Series([], dtype=object)
     column = FloatColumn(empty.name)
     column.update(empty)

     # seed the stored histogram directly instead of profiling data
     stored = column._stored_histogram['histogram']
     stored['bin_counts'] = np.array([3, 2, 1])
     stored['bin_edges'] = np.array([1.0, 3.0, 5.0, 7.0])

     rebuilt = column._histogram_to_array()
     self.assertEqual([1.0, 1.0, 1.0, 3.0, 3.0, 7.0], rebuilt.tolist())
Ejemplo n.º 12
0
 def test_histogram_to_array(self):
     """
     Checks the array rebuilt from a hand-seeded 'auto' histogram.
     """
     empty = pd.Series([], dtype=object)
     column = FloatColumn(empty.name)
     column.update(empty)

     # seed the 'auto' histogram directly instead of profiling data
     auto_histogram = column.histogram_methods['auto']['histogram']
     auto_histogram['bin_counts'] = np.array([3, 2, 1])
     auto_histogram['bin_edges'] = np.array([1.0, 3.0, 5.0, 7.0])

     rebuilt = column._histogram_to_array('auto')
     self.assertCountEqual(rebuilt, [1.0, 1.0, 1.0, 3.0, 3.0, 7.0])
Ejemplo n.º 13
0
 def test_total_histogram_bin_variance(self):
     """
     Checks the total per-bin variance of an input array measured against
     a hand-seeded stored histogram.
     """
     empty = pd.Series([], dtype=object)
     column = FloatColumn(empty.name)
     column.update(empty)

     # seed the stored histogram directly instead of profiling data
     stored = column._stored_histogram['histogram']
     stored['bin_counts'] = np.array([3, 2, 1])
     stored['bin_edges'] = np.array([1.0, 3.0, 5.0, 7.0])

     values = np.array([1.1, 1.5, 2.3, 3.5, 4.0, 6.5])

     # the expected value is the sum of the variances of the values that
     # fall between consecutive bin edges
     first_bin_var = np.array([1.1, 1.5, 2.3]).var()
     second_bin_var = np.array([3.5, 4.0]).var()
     third_bin_var = np.array([6.5]).var()
     expected_total_var = first_bin_var + second_bin_var + third_bin_var

     est_total_var = column._total_histogram_bin_variance(values)
     self.assertEqual(expected_total_var, est_total_var)
Ejemplo n.º 14
0
    def test_data_type_ratio(self):
        """
        Checks the data type ratio before and after a non-float value is
        added to the profile.
        """
        # four float values, all given as strings
        floats_only = pd.Series(np.linspace(-5, 5, 4)).apply(str)
        column = FloatColumn(floats_only.name)
        column.update(floats_only)
        self.assertEqual(column.data_type_ratio, 1.0)

        # a fifth, non-float value brings the ratio down to 4 / 5
        column.update(pd.Series(['not a float']))
        self.assertEqual(column.data_type_ratio, 0.8)
Ejemplo n.º 15
0
 def test_estimate_stats_from_histogram(self):
     """
     Checks the variance estimated from a hand-seeded stored histogram.

     The expected value is the count-weighted variance of the bin
     midpoints (2.0, 4.0 and 6.0 for edges 1, 3, 5, 7).
     """
     data = pd.Series([], dtype=object)
     profiler = FloatColumn(data.name)
     profiler.update(data)
     # seed the stored histogram directly instead of profiling data
     profiler._stored_histogram['histogram']['bin_counts'] = \
         np.array([1, 2, 1])
     profiler._stored_histogram['histogram']['bin_edges'] = \
         np.array([1.0, 3.0, 5.0, 7.0])
     expected_mean = (2.0 * 1 + 4.0 * 2 + 6.0 * 1) / 4
     expected_var = (1 * (2.0 - expected_mean)**2 + 2 *
                     (4.0 - expected_mean)**2 + 1 *
                     (6.0 - expected_mean)**2) / 4
     # only the variance is compared with the value the method returns
     est_var = profiler._estimate_stats_from_histogram()
     self.assertEqual(expected_var, est_var)
Ejemplo n.º 16
0
    def test_merge_histogram(self):
        """
        Checks that merging new values into a hand-seeded 'sqrt' histogram
        yields the expected bin counts and edges.
        """
        empty = pd.Series([], dtype=object)
        column = FloatColumn(empty.name)
        column.update(empty)

        # seed the 'sqrt' histogram directly instead of profiling data
        seeded = column.histogram_methods['sqrt']['histogram']
        seeded['bin_counts'] = np.array([3, 2])
        seeded['bin_edges'] = np.array([1.0, 3.0, 5.0])

        column._merge_histogram([0.5, 1.0, 2.0, 5.0], 'sqrt')

        # look the histogram up again rather than reusing `seeded`
        merged_bin_counts = \
            column.histogram_methods['sqrt']['histogram']['bin_counts']
        merged_bin_edges = \
            column.histogram_methods['sqrt']['histogram']['bin_edges']

        self.assertCountEqual(merged_bin_counts, [5, 2, 2])
        self.assertCountEqual(merged_bin_edges, [0.5, 2.0, 3.5, 5.0])
Ejemplo n.º 17
0
    def test_profile_merge_with_different_options(self):
        """
        Checks merging of two profiles created with different options.

        A warning is expected for every statistic that is enabled in only
        one of the profiles. The remaining statistics must still be merged,
        and the disabled ones must not be calculated.
        """
        # First profiler: max and min disabled
        options = FloatOptions()
        options.max.is_enabled = False
        options.min.is_enabled = False

        data = [2, 4, 6, 8]
        df = pd.Series(data).apply(str)
        profiler1 = FloatColumn("Float", options=options)
        profiler1.update(df)

        # Second profiler: min and precision disabled
        options = FloatOptions()
        options.min.is_enabled = False
        options.precision.is_enabled = False
        data2 = [10, 15]
        df2 = pd.Series(data2).apply(str)
        profiler2 = FloatColumn("Float", options=options)
        profiler2.update(df2)

        # Asserting warning when adding 2 profilers with different options
        with warnings.catch_warnings(record=True) as w:
            # Record every warning regardless of the ambient filters;
            # otherwise an "ignore"/"once" filter installed by the test
            # runner could hide the warnings this test must observe.
            warnings.simplefilter("always")
            profiler3 = profiler1 + profiler2
            list_of_warning_messages = [str(warning.message) for warning in w]

            warning1 = "precision is disabled because it is not enabled in both" \
                       " profiles."
            warning2 = "max is disabled because it is not enabled in both " \
                       "profiles."
            self.assertIn(warning1, list_of_warning_messages)
            self.assertIn(warning2, list_of_warning_messages)

        # Assert that these features are still merged
        self.assertEqual("rice", profiler3.histogram_selection)
        self.assertEqual(21.5, profiler3.variance)
        self.assertEqual(45.0, profiler3.sum)

        # Assert that these features are not calculated
        self.assertIsNone(profiler3.max)
        self.assertIsNone(profiler3.min)
        self.assertEqual(0, profiler3.precision)
Ejemplo n.º 18
0
    def test_profile_merge(self):
        """
        Checks the statistics and histogram of two float profiles added
        together.
        """
        def build_profiler(values):
            series = pd.Series(values).apply(str)
            column = FloatColumn("Float")
            column.update(series)
            return column

        profiler1 = build_profiler([2.0, 'not a float', 6.0, 'not a float'])
        profiler2 = build_profiler([10.0, 'not a float', 15.0, 'not a float'])

        # expected statistics for the merged float values 2, 6, 10 and 15
        expected_min = 2.0
        expected_max = 15.0
        expected_mean = 8.25
        expected_variance = 30.916666666666668
        expected_stddev = np.sqrt(30.916)
        expected_bin_counts = np.array([1, 1, 1, 1])
        expected_bin_edges = np.array([2., 5.25, 8.5, 11.75, 15.])

        profiler3 = profiler1 + profiler2

        # read the profile before asserting on the merged profiler
        merged_report = profiler3.profile
        histogram = merged_report.pop('histogram')

        # stddev and variance are only compared to three decimal places
        self.assertAlmostEqual(profiler3.stddev, expected_stddev, places=3)
        self.assertAlmostEqual(profiler3.variance, expected_variance,
                               places=3)
        self.assertEqual(profiler3.mean, expected_mean)
        self.assertEqual(profiler3.histogram_selection, 'rice')
        self.assertEqual(profiler3.min, expected_min)
        self.assertEqual(profiler3.max, expected_max)
        self.assertCountEqual(histogram['bin_counts'], expected_bin_counts)
        self.assertCountEqual(histogram['bin_edges'], expected_bin_edges)
Ejemplo n.º 19
0
    def test_single_data_variance_case(self):
        """
        Checks count, mean and variance when values arrive one per update.
        """
        first_value = pd.Series([1.5]).apply(str)
        column = FloatColumn(first_value.name)

        # a single value: the variance is expected to be zero
        column.update(first_value)
        self.assertEqual(column.match_count, 1.0)
        self.assertEqual(column.mean, 1.5)
        self.assertEqual(column.variance, 0.0)

        # a second value arriving in a separate update
        column.update(pd.Series([2.5]).apply(str))
        self.assertEqual(column.match_count, 2)
        self.assertEqual(column.mean, 2.0)
        self.assertEqual(column.variance, 0.5)
Ejemplo n.º 20
0
    def test_select_method_for_histogram(self):
        """
        Checks which histogram method is selected for given current
        variances, run times and previously stored total losses.
        """
        empty = pd.Series([], dtype=object)
        column = FloatColumn(empty.name)
        column.update(empty)

        methods = ['auto', 'fd', 'doane', 'scott', 'rice', 'sturges', 'sqrt']
        exact_var = 0

        scenarios = [
            # sqrt has the least current loss and all methods have the
            # same total loss
            {
                'est_var': np.array(
                    [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.005]),
                'total_var': np.array(
                    [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01]),
                'run_time': np.array(
                    [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01]),
                'total_loss': np.array(
                    [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]),
                'selected': 'sqrt',
            },
            # sqrt has the least current loss but sturges has the least
            # total loss
            {
                'est_var': np.array(
                    [0.03, 0.03, 0.03, 0.03, 0.03, 0.03, 0.029]),
                'total_var': np.array(
                    [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01]),
                'run_time': np.array(
                    [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01]),
                'total_loss': np.array(
                    [0.1, 0.1, 0.1, 0.1, 0.1, 0.05, 0.1]),
                'selected': 'sturges',
            },
        ]

        # both scenarios run against the same profiler instance, in order
        for scenario in scenarios:
            for method, loss in zip(methods, scenario['total_loss']):
                column.histogram_methods[method]['total_loss'] = loss
            selected_method = column._select_method_for_histogram(
                exact_var, scenario['est_var'], scenario['total_var'],
                scenario['run_time'])
            self.assertEqual(selected_method, scenario['selected'])
Ejemplo n.º 21
0
    def test_profiled_min(self):
        """
        Checks the profiled minimum over successive updates and for
        several special inputs.
        """
        # eleven values from -5 to 5, profiled in several updates
        series = pd.Series(np.linspace(-5, 5, 11)).apply(str)
        column = FloatColumn(series.name)

        # the smallest value is left out at first
        column.update(series[1:])
        self.assertEqual(column.min, -4)

        column.update(series)
        self.assertEqual(column.min, -5)

        # a larger value must not change the minimum
        column.update(pd.Series(['-4']))
        self.assertEqual(column.min, -5)

        special_cases = [
            # empty data
            (pd.Series([], dtype=object), None),
            # data with None value
            (pd.Series([2.0, 3.0, None, np.nan]).apply(str), 2.0),
            # data with one value
            (pd.Series([2.0]).apply(str), 2.0),
            # data with unique value
            (pd.Series([2.0, 2.0, 2.0, 2.0, 2.0]).apply(str), 2.0),
            # data with unique value as zero
            (pd.Series([0.0, 0.0, 0.0, 0.0, 0.0]).apply(str), 0.0),
        ]
        # every special case gets its own fresh profiler
        for case_series, expected_min in special_cases:
            case_column = FloatColumn(case_series.name)
            case_column.update(case_series)
            self.assertEqual(case_column.min, expected_min)
Ejemplo n.º 22
0
    def test_profile_merge_edge_case(self):
        """
        Checks merging when one profile has a zero match count, when both
        profiles are empty, and when one or neither profile is empty.
        """
        def profiler_for(series):
            column = FloatColumn("Float")
            column.update(series)
            return column

        def as_strings(values):
            return pd.Series(values).apply(str)

        # a profile whose match count is forced to zero after the update
        zero_match = profiler_for(
            as_strings([2.0, 'not a float', 6.0, 'not a float']))
        zero_match.match_count = 0
        populated = profiler_for(
            as_strings([10.0, 'not a float', 15.0, 'not a float']))

        merged = zero_match + populated
        self.assertEqual(merged.stddev, populated.stddev)

        # test merge with empty data
        empty_left = profiler_for(pd.Series([], dtype=object))
        empty_right = profiler_for(pd.Series([], dtype=object))

        merged = empty_left + empty_right
        self.assertEqual(merged.min, None)
        self.assertEqual(merged.max, None)

        # empty profile merged with a populated one
        low_values = profiler_for(as_strings([2.0, 3.0]))
        merged = empty_left + low_values
        self.assertEqual(merged.min, 2.0)
        self.assertEqual(merged.max, 3.0)

        # two populated profiles
        high_values = profiler_for(as_strings([4.0, 5.0]))
        merged = low_values + high_values
        self.assertEqual(merged.min, 2.0)
        self.assertEqual(merged.max, 5.0)
Ejemplo n.º 23
0
    def test_option_precision(self):
        """
        Checks the precision sample size reported for different precision
        option settings.
        """
        df = pd.Series([1.1, 2.2, 3.3, 4.4]).apply(str)

        option_cases = (
            # precision turned off: no sample size
            ({"precision.is_enabled": False}, None),
            # precision turned on: all four values are sampled
            ({"precision.is_enabled": True}, 4),
            # sample ratio of 0.5: half of the values are sampled
            ({"precision.sample_ratio": 0.5}, 2),
        )
        # every case gets fresh options and a fresh profiler
        for settings, expected_sample_size in option_cases:
            options = FloatOptions()
            options.set(settings)
            column = FloatColumn(df.name, options=options)
            column.update(df)
            self.assertEqual(expected_sample_size,
                             column.precision['sample_size'])
Ejemplo n.º 24
0
    def test_profiled_max(self):
        """
        Checks the profiled maximum over successive updates and for
        several special inputs.
        """
        # eleven values from -5 to 5, profiled in several updates
        series = pd.Series(np.linspace(-5, 5, 11)).apply(str)
        column = FloatColumn(series.name)

        # the largest value is left out at first
        column.update(series[:-1])
        self.assertEqual(column.max, 4)

        column.update(series)
        self.assertEqual(column.max, 5)

        # a smaller value must not change the maximum
        column.update(pd.Series(['4']))
        self.assertEqual(column.max, 5)

        special_cases = [
            # empty data
            (pd.Series([], dtype=object), None),
            # data with None value
            (pd.Series([2.0, 3.0, None, np.nan]).apply(str), 3.0),
            # data with one value
            (pd.Series([2.0]).apply(str), 2.0),
            # data with unique value
            (pd.Series([2.0, 2.0, 2.0, 2.0, 2.0]).apply(str), 2.0),
            # data with unique value as zero
            (pd.Series([0.0, 0.0, 0.0, 0.0, 0.0]).apply(str), 0.0),
        ]
        # every special case gets its own fresh profiler
        for case_series, expected_max in special_cases:
            case_column = FloatColumn(case_series.name)
            case_column.update(case_series)
            self.assertEqual(case_column.max, expected_max)
Ejemplo n.º 25
0
    def test_profile(self):
        """
        Checks the complete profile of a float column under a mocked clock.

        `time.time` is patched so that successive calls return 1.0, 2.0,
        3.0, ..., which makes the recorded timings deterministic. The
        histogram and the quartiles are compared separately from the rest
        of the profile dictionary.
        """
        data = [2.5, 12.5, 'not a float', 5, 'not a float']
        df = pd.Series(data).apply(str)

        profiler = FloatColumn(df.name)

        expected_profile = dict(
            min=2.5,
            max=12.5,
            mean=20/3.0,
            median=None,
            variance=27 + 1/12.0,
            stddev=np.sqrt(27+1/12.0),
            histogram={
                'bin_counts': np.array([1, 1, 0, 1]),
                'bin_edges': np.array([2.5, 5.0, 7.5, 10.0, 12.5]),
            },
            quantiles={
                0: 3.75,
                1: 5.0,
                2: 10.0
            },
            times=defaultdict(float, {'histogram_and_quantiles': 15.0,
                                      'precision': 1.0, 'max': 1.0, 'min': 1.0,
                                      'sum': 1.0, 'variance': 1.0}),
            precision=1.0
        )
        # popping from the end of a descending list yields 1.0, 2.0, ...
        time_array = [float(i) for i in range(100, 0, -1)]
        with mock.patch('time.time', side_effect=lambda: time_array.pop()):
            # Validate that the times dictionary is empty
            self.assertEqual(defaultdict(float), profiler.profile['times'])
            profiler.update(df)

            profile = profiler.profile
            # pop out the histogram to test separately from the rest of the dict
            # as we need comparison with some precision
            histogram = profile.pop('histogram')
            expected_histogram = expected_profile.pop('histogram')
            quantiles = profile.pop('quantiles')
            expected_quantiles = expected_profile.pop('quantiles')
            # positions 249, 499 and 749 are read as the three quartiles
            actual_quartiles = {
                0: quantiles[249],
                1: quantiles[499],
                2: quantiles[749]
            }

            self.assertDictEqual(expected_profile, profile)
            # compare against the profiled precision; the previous check
            # compared the expected literal with itself and could not fail
            self.assertEqual(expected_profile['precision'],
                             profile['precision'])
            self.assertCountEqual(expected_histogram['bin_counts'],
                                  histogram['bin_counts'])
            self.assertCountEqual(
                np.round(expected_histogram['bin_edges'], 12),
                np.round(histogram['bin_edges'], 12))

            self.assertDictEqual(actual_quartiles, expected_quantiles)

            # Validate the recorded times accumulate after a second update
            profiler.update(df)
            expected = defaultdict(float, {'min': 2.0, 'max': 2.0, 'sum': 2.0,
                                           'variance': 2.0, 'precision': 2.0,
                                           'histogram_and_quantiles': 30.0})
            self.assertEqual(expected, profiler.profile['times'])
Ejemplo n.º 26
0
    def test_profiled_mean_and_variance(self):
        """
        Checks the mean, variance and standard deviation of a float column
        after each of three successive updates.
        :return:
        """
        def average(values):
            # explicit accumulation keeps the floating point summation
            # order fixed; the assertions below use exact equality
            running_total = 0
            for value in values:
                running_total += value
            return running_total / len(values)

        def sample_variance(values):
            # variance with an (n - 1) denominator
            centre = average(values)
            squared_deviations = 0
            for value in values:
                squared_deviations += (value - centre)**2
            return squared_deviations / (len(values) - 1)

        def combined_variance(mean_a, var_a, count_a, mean_b, var_b,
                              count_b):
            # variance of two batches computed from their summary statistics
            delta = mean_b - mean_a
            m_a = var_a * (count_a - 1)
            m_b = var_b * (count_b - 1)
            M2 = m_a + m_b + delta**2 * count_a * count_b / (count_a + count_b)
            return M2 / (count_a + count_b - 1)

        df1 = pd.Series(np.linspace(-5, 5, 11).tolist())
        df2 = pd.Series(np.linspace(-3, 2, 11).tolist())
        df3 = pd.Series(np.full((10, ), 1))

        num_profiler = FloatColumn(df1.name)
        num_profiler.update(df1.apply(str))

        self.assertEqual(average(df1), num_profiler.mean)
        self.assertEqual(sample_variance(df1), num_profiler.variance)
        self.assertEqual(np.sqrt(sample_variance(df1)), num_profiler.stddev)

        profiled_batches = [df1]
        for batch in (df2, df3):
            # the expected variance must be computed from the profiler's
            # statistics as they are BEFORE this batch is added
            expected_variance = combined_variance(
                mean_a=num_profiler.mean,
                var_a=num_profiler.variance,
                count_a=num_profiler.match_count,
                mean_b=average(batch),
                var_b=sample_variance(batch),
                count_b=batch.count())
            num_profiler.update(batch.apply(str))

            profiled_batches.append(batch)
            everything = pd.concat(profiled_batches)
            self.assertEqual(average(everything), num_profiler.mean)
            self.assertEqual(expected_variance, num_profiler.variance)
            self.assertEqual(np.sqrt(expected_variance),
                             num_profiler.stddev)
Ejemplo n.º 27
0
    def test_profiled_histogram(self):
        """
        Checks the histogram reported for several small float columns, each
        profiled with a fresh profiler.
        :return:
        """
        # each entry: (input series, expected histogram, round edges?)
        cases = [
            # 4 bins, range of 3: equal bins of width 0.75
            (pd.Series(["1.0", "2.0", "3.0", "4.0"]),
             {'bin_counts': np.array([1, 1, 1, 1]),
              'bin_edges': np.array([1.0, 1.75, 2.5, 3.25, 4.0])},
             True),
            # 4 bins, range of 12: equal bins of width 3.0
            (pd.Series(["1.0", "5.0", "8.0", "13.0"]),
             {'bin_counts': np.array([1, 1, 1, 1]),
              'bin_edges': np.array([1.0, 4.0, 7.0, 10.0, 13.0])},
             True),
            # 3 bins, range of 3: equal bins of width 1
            (pd.Series(["1.0", "1.0", "3.0", "4.0"]),
             {'bin_counts': np.array([2, 0, 2]),
              'bin_edges': np.array([1.0, 2.0, 3.0, 4.0])},
             True),
            # only one unique value, not overflow
            (pd.Series([-10.0, -10.0, -10.0]).apply(str),
             {'bin_counts': np.array([3]),
              'bin_edges': np.array([-10.0, -10.0])},
             True),
            # only one unique value, overflow: edges compared unrounded
            (pd.Series([-10.0**20]).apply(str),
             {'bin_counts': np.array([1]),
              'bin_edges': np.array([-10.0**20, -10.0**20])},
             False),
        ]

        for series, expected_histogram, round_edges in cases:
            column = FloatColumn(series.name)
            column.update(series)
            histogram = column.profile['histogram']

            self.assertCountEqual(expected_histogram['bin_counts'],
                                  histogram['bin_counts'])
            if round_edges:
                self.assertCountEqual(
                    np.round(expected_histogram['bin_edges'], 12),
                    np.round(histogram['bin_edges'], 12))
            else:
                self.assertCountEqual(expected_histogram['bin_edges'],
                                      histogram['bin_edges'])
Ejemplo n.º 28
0
    def test_profiled_precision(self):
        """
        Verifies the 'precision' attribute reported by FloatColumn, both
        across successive updates and for a column with mixed decimals.
        :return:
        """
        one_decimal = pd.Series([0.4, 0.3, 0.1, 0.1, 0.1]).apply(str)
        two_decimals = pd.Series([0.11, 0.11, 0.12, 2.11]).apply(str)
        three_decimals = pd.Series([4.114, 3.161, 2.512, 2.131]).apply(str)
        mixed_decimals = pd.Series([4.1, 3., 2.52, 2.13143]).apply(str)

        # A single profiler is fed batches with fewer and fewer decimal
        # digits; the expected precision is checked after every update.
        column = FloatColumn("Name")
        batches = (
            (three_decimals, 3),
            (two_decimals, 2),
            (one_decimal, 1),
        )
        for batch, expected_precision in batches:
            column.update(batch)
            self.assertEqual(expected_precision, column.precision)

        # A fresh profiler that sees values with differing numbers of
        # decimal digits in one batch is expected to report 1.
        column = FloatColumn("Name")
        column.update(mixed_decimals)
        self.assertEqual(1, column.precision)
Ejemplo n.º 29
0
 def test_float_column_with_wrong_options(self):
     """
     Checks that constructing FloatColumn with a plain string passed as
     'options' raises a ValueError carrying the expected message.
     :return:
     """
     # assertRaisesRegex treats this text as a regular expression that is
     # searched for in the raised error's message.
     expected_message = ("FloatColumn parameter 'options' must be of"
                         " type FloatOptions.")
     with self.assertRaisesRegex(ValueError, expected_message):
         FloatColumn("Float", options="wrong_data_type")
Ejemplo n.º 30
0
    def test_histogram_with_varying_number_of_bin(self):
        """
        Checks how many histogram bins are reported for different
        max_histogram_bin settings.
        """
        def count_bins(series, bin_limit):
            # Profile `series` with a fresh FloatColumn whose
            # max_histogram_bin is set to `bin_limit` before the update,
            # and return the number of histogram bin counts reported.
            column = FloatColumn(series.name)
            column.max_histogram_bin = bin_limit
            column.update(series)
            return len(column.profile['histogram']['bin_counts'])

        # this data needs fewer bins than the max limit
        small_range = pd.Series([1, 2, 3, 4]).apply(str)
        self.assertEqual(count_bins(small_range, 50), 4)

        # this data would use a large number of bins, so the count is
        # expected to be set to the max limit
        wide_range = pd.Series(
            [3.195103249264023e+18, 9999995.0, 9999999.0, 0.0,
             -10**10]).apply(str)
        self.assertEqual(count_bins(wide_range, 50), 50)

        # same data with the max number of bins raised to 10000
        self.assertEqual(count_bins(wide_range, 10000), 10000)