Exemple #1
0
 def test_build_already_build(self, calculate_bins_func, add_hist_func):
     """Calling build() on a histogram already flagged as built must not
     trigger either of the patched helpers.

     NOTE(review): the two extra arguments are presumably mocks injected by
     patch decorators that are not visible in this excerpt -- confirm.
     """
     already_built = Histogram()
     # Pretend a previous build() call has completed.
     already_built.is_build = True

     already_built.build()

     # Neither helper may have been invoked by the second build attempt.
     for patched_helper in (add_hist_func, calculate_bins_func):
         self.assertFalse(patched_helper.called)
Exemple #2
0
 def test_build(self):
     """A first build() on a two-bin histogram holding two columns should
     produce a bin list of three entries, one hist_dict entry per column,
     and set the is_build flag."""
     hist = Histogram(bins=2)
     source_df = self.create_test_df()

     # Select both single-column frames first, then register them in order.
     selected_columns = [
         source_df.select(F.col(column_name))
         for column_name in ('value', 'value2')
     ]
     for selected in selected_columns:
         hist.add_column(selected)

     hist.build()

     # Two bins are delimited by three borders.
     self.assertEqual(3, len(hist.bin_list))
     # One histogram per added column.
     self.assertEqual(2, len(hist.hist_dict))
     self.assertTrue(hist.is_build)
Exemple #3
0
 def test_add_hist_single_value(self):
     """A column holding one repeated value should still yield n bins
     (n + 1 bin borders), with the lowest border at value - 0.5 and the
     highest border at value + 0.5, and every row counted in the middle
     bin."""
     single_column_value = 1
     nr_bins = 5
     row_count = 100
     column_values = [single_column_value] * row_count

     pandas_frame = pd.DataFrame({'foo': column_values})
     test_df = self.sqlCtx.createDataFrame(pandas_frame)

     hist = Histogram(bins=nr_bins)
     hist.add_column(test_df.select(F.col('foo')))
     hist.build()

     # Five bins are delimited by six borders.
     self.assertEqual(6, len(hist.bin_boundaries))
     self.assertEqual(single_column_value - 0.5, min(hist.bin_boundaries))
     self.assertEqual(single_column_value + 0.5, max(hist.bin_boundaries))

     # All rows are expected in the bin at the middle index.
     middle_bin = math.floor(nr_bins / 2)
     self.assertEqual(len(column_values), hist.hist_dict['foo'][middle_bin])