def test_calculate_bins_multiple_columns(self):
    """Bin edges should be evenly spaced between the overall minimum and
    maximum taken across every added column."""
    histogram = Histogram(bins=3)
    frame = self.create_test_df()
    # Across both columns of this frame the smallest value is 1 and the
    # largest is 4.
    for column_name in ('value', 'value2'):
        histogram.add_column(frame.select(F.col(column_name)))
    expected_edges = [1, 2, 3, 4]
    self.assertListEqual(expected_edges, histogram._calculate_bins())
def test_add_hist_single_column_sets_bin_list(self):
    """When bins is given as a single number (2 here), the bin list should
    hold three entries after a column's histogram has been added."""
    histogram = Histogram(bins=2)
    frame = self.create_test_df()
    value_column = frame.select(F.col('value'))

    histogram.add_column(value_column)
    histogram.bin_list = histogram._calculate_bins()
    histogram._add_hist(value_column, 'value')

    bin_list_length = len(histogram.bin_list)
    self.assertEqual(3, bin_list_length)
def test_add_hist_single_column(self):
    """Adding one column should store its list of bin values (i.e. how many
    values fall in each bin) in hist_dict, keyed by the column name.

    If multiple columns share a name, a number is appended to the key.
    """
    histogram = Histogram(bins=2)
    frame = self.create_test_df()
    value_column = frame.select(F.col('value'))

    histogram.add_column(value_column)
    histogram.bin_list = histogram._calculate_bins()
    histogram._add_hist(value_column, 'value')

    # Exactly one entry is expected, holding the per-bin counts.
    entry_count = len(histogram.hist_dict)
    self.assertEqual(1, entry_count)
    expected_counts = [1, 2]
    self.assertListEqual(expected_counts, histogram.hist_dict['value'])
def test_add_hist_multiple_column_rename_column(self):
    """Should rename the histogram key when the same column name is added twice.

    Two selections of the same 'value' column are added under the same name;
    the test expects two entries in hist_dict, one of them keyed 'value (1)'.
    """
    hist = Histogram(bins=2)
    test_df = self.create_test_df()
    column_to_add = test_df.select(F.col('value'))
    column_to_add_2 = test_df.select(F.col('value'))
    hist.add_column(column_to_add)
    hist.add_column(column_to_add_2)
    hist.bin_list = hist._calculate_bins()
    hist._add_hist(column_to_add, 'value')
    hist._add_hist(column_to_add_2, 'value')
    self.assertEqual(2, len(hist.hist_dict))
    # assertIn reports the missing key and the container on failure, whereas
    # assertTrue(x in y) only reports "False is not true".
    self.assertIn('value (1)', hist.hist_dict)
def test_add_hist_multiple_column(self):
    """Adding a second, differently named column should put a second list of
    bin values into hist_dict."""
    histogram = Histogram(bins=2)
    frame = self.create_test_df()
    first_column = frame.select(F.col('value'))
    second_column = frame.select(F.col('value2'))

    # Register both columns before computing the shared bin list.
    histogram.add_column(first_column)
    histogram.add_column(second_column)
    histogram.bin_list = histogram._calculate_bins()

    histogram._add_hist(first_column, 'value')
    histogram._add_hist(second_column, 'value2')

    entry_count = len(histogram.hist_dict)
    self.assertEqual(2, entry_count)
    expected_counts = [1, 2]
    self.assertListEqual(expected_counts, histogram.hist_dict['value2'])
def test_calculate_bins_single_column(self):
    """With only one column added and no min/max configured, the bin count
    itself should be returned."""
    bin_count = 5
    histogram = Histogram(bins=bin_count)
    frame = self.create_test_df()
    value_column = frame.select(F.col('value'))
    histogram.add_column(value_column)
    self.assertEqual(5, histogram._calculate_bins())
def test_calculate_bins_bins_set(self):
    """A list of bin edges passed to the constructor should simply be
    returned."""
    histogram = Histogram(bins=[1, 2, 3])
    # A separate literal is used for the expectation so it cannot alias the
    # list handed to the constructor.
    expected_edges = [1, 2, 3]
    calculated = histogram._calculate_bins()
    self.assertListEqual(expected_edges, calculated)
def test_calculate_bins(self):
    """When a range (min and max bin) is configured, the bin edges should be
    evenly spaced between its two ends."""
    histogram = Histogram(range=(5, 10), bins=2)
    # Two bins over the range 5..10 are expected to give three edges.
    expected_edges = [5, 7.5, 10]
    calculated = histogram._calculate_bins()
    self.assertListEqual(expected_edges, calculated)