def test_get_max_value(self):
    """Check that ``_get_max_value`` reports the maximum over all added columns.

    Both ``value`` and ``value2`` of the test data frame are added to a
    10-bin histogram; the expected overall maximum is 4.
    """
    histogram = Histogram(bins=10)
    source_df = self.create_test_df()
    for column_name in ('value', 'value2'):
        histogram.add_column(source_df.select(F.col(column_name)))

    self.assertEqual(4, histogram._get_max_value())
def test_calculate_bins_multiple_columns(self):
    """Check that ``_calculate_bins`` yields evenly spaced borders over all columns.

    The borders are expected to run from the smallest to the largest value
    found across every added column; with 3 bins that is ``[1, 2, 3, 4]``.
    """
    histogram = Histogram(bins=3)
    # The lowest value in this data frame is 1, the highest is 4.
    source_df = self.create_test_df()
    for column_name in ('value', 'value2'):
        histogram.add_column(source_df.select(F.col(column_name)))

    expected_borders = [1, 2, 3, 4]
    self.assertListEqual(expected_borders, histogram._calculate_bins())
def test_add_column_non_numeric(self):
    """Check that adding a non-numeric column raises a ``ValueError``.

    A single-column data frame of strings ('a', 'b') is built from an RDD of
    ``Row`` objects and handed to ``add_column``.
    """
    string_rows = self.sc.parallelize(['a', 'b']).map(
        lambda item: Row(value=item))
    string_df = self.sqlCtx.createDataFrame(string_rows)
    histogram = Histogram()

    with self.assertRaises(ValueError):
        histogram.add_column(string_df)
def test_add_column(self):
    """Check that a single-column data frame is recorded in ``col_list``.

    After one ``add_column`` call the list must hold exactly one entry, whose
    index 0 is the data frame and whose index 1 is the column name.
    """
    histogram = Histogram(bins=10)
    source_df = self.create_test_df()
    histogram.add_column(source_df.select(F.col('value')))

    self.assertEqual(1, len(histogram.col_list))

    stored_entry = histogram.col_list[0]
    self.assertEqual('value', stored_entry[1])
    # Compare against a freshly selected frame, not the object that was added.
    expected_df = source_df.select(F.col('value'))
    self.assertDataFrameEqual(expected_df, histogram.col_list[0][0])
def test_add_hist_single_column_sets_bin_list(self):
    """Check that ``_add_hist`` sets the bin list when it is a single number.

    ``bin_list`` is first assigned whatever ``_calculate_bins`` returns for a
    single column; after ``_add_hist`` runs it must contain 3 entries for the
    2 requested bins.
    """
    histogram = Histogram(bins=2)
    value_column = self.create_test_df().select(F.col('value'))
    histogram.add_column(value_column)

    histogram.bin_list = histogram._calculate_bins()
    histogram._add_hist(value_column, 'value')

    self.assertEqual(3, len(histogram.bin_list))
def test_add_hist_single_column(self):
    """Check that ``_add_hist`` stores per-bin counts in ``hist_dict``.

    The counts (number of values falling in each bin) are keyed by column
    name. If several columns share a name, a number is appended to the key;
    that case is not exercised here.
    """
    histogram = Histogram(bins=2)
    value_column = self.create_test_df().select(F.col('value'))
    histogram.add_column(value_column)

    histogram.bin_list = histogram._calculate_bins()
    histogram._add_hist(value_column, 'value')

    self.assertEqual(1, len(histogram.hist_dict))
    expected_counts = [1, 2]
    self.assertListEqual(expected_counts, histogram.hist_dict['value'])
def test_add_multiple_columns(self):
    """Check that every ``add_column`` call appends a new entry to ``col_list``.

    Two columns are added; each entry must carry the column name at index 1
    and the matching data frame at index 0, in insertion order.
    """
    histogram = Histogram(bins=10)
    source_df = self.create_test_df()
    column_names = ('value', 'value2')
    for column_name in column_names:
        histogram.add_column(source_df.select(F.col(column_name)))

    self.assertEqual(2, len(histogram.col_list))

    for position, column_name in enumerate(column_names):
        self.assertEqual(column_name, histogram.col_list[position][1])
        self.assertDataFrameEqual(
            source_df.select(F.col(column_name)),
            histogram.col_list[position][0])
def test_to_pandas_density(self):
    """Check that ``to_pandas('density')`` returns the density form of the histogram.

    The expected frame has one column per added Spark column and is indexed
    by the values 1.75 and 3.25.
    """
    histogram = Histogram(bins=2)
    source_df = self.create_test_df()
    for column_name in ('value', 'value2'):
        histogram.add_column(source_df.select(F.col(column_name)))

    expected_densities = {
        'value': [1.0, 0.5],
        'value2': [0.5, 1.0]
    }
    expected_df = pd.DataFrame(expected_densities).set_index([[1.75, 3.25]])

    actual_df = histogram.to_pandas('density')
    self.assertTrue(expected_df.equals(actual_df))
def test_to_pandas_default(self):
    """Check that ``to_pandas()`` converts the Histogram into a pandas data frame.

    The expected frame has one column of counts per added Spark column and is
    indexed by the bin range labels '1.00 - 2.50' and '2.50 - 4.00'.
    """
    histogram = Histogram(bins=2)
    source_df = self.create_test_df()
    for column_name in ('value', 'value2'):
        histogram.add_column(source_df.select(F.col(column_name)))

    expected_counts = {
        'value': [2, 1],
        'value2': [1, 2]
    }
    bin_labels = ['1.00 - 2.50', '2.50 - 4.00']
    expected_df = pd.DataFrame(expected_counts).set_index([bin_labels])

    actual_df = histogram.to_pandas()
    self.assertTrue(expected_df.equals(actual_df))
def test_build(self):
    """Check that ``build`` fills in the bin list and the per-column counts.

    For a 2-bin histogram holding two columns, ``build`` must leave 3 entries
    in ``bin_list``, 2 entries in ``hist_dict`` and a truthy ``is_build``.
    """
    histogram = Histogram(bins=2)
    source_df = self.create_test_df()
    for column_name in ('value', 'value2'):
        histogram.add_column(source_df.select(F.col(column_name)))

    histogram.build()

    self.assertEqual(3, len(histogram.bin_list))
    self.assertEqual(2, len(histogram.hist_dict))
    self.assertTrue(histogram.is_build)
def test_add_hist_multiple_column_rename_column(self):
    """Check that a repeated column name is renamed inside ``hist_dict``.

    The ``value`` column is added twice and ``_add_hist`` is called twice with
    the same name. ``hist_dict`` must then hold two entries, one of them under
    the key ``'value (1)'``.
    """
    hist = Histogram(bins=2)
    test_df = self.create_test_df()
    column_to_ad = test_df.select(F.col('value'))
    column_to_ad_2 = test_df.select(F.col('value'))
    hist.add_column(column_to_ad)
    hist.add_column(column_to_ad_2)

    hist.bin_list = hist._calculate_bins()
    hist._add_hist(column_to_ad, 'value')
    hist._add_hist(column_to_ad_2, 'value')

    self.assertEqual(2, len(hist.hist_dict))
    # assertIn reports the missing key and the container on failure, whereas
    # assertTrue(x in y) only reports "False is not true".
    self.assertIn('value (1)', hist.hist_dict)
def test_add_hist_multiple_column(self):
    """Check that a second ``_add_hist`` call adds a second list of counts.

    Bins are calculated once over both added columns; afterwards ``hist_dict``
    must hold two entries and the counts for ``value2`` must be ``[1, 2]``.
    """
    histogram = Histogram(bins=2)
    source_df = self.create_test_df()
    first_column = source_df.select(F.col('value'))
    second_column = source_df.select(F.col('value2'))
    for column_df in (first_column, second_column):
        histogram.add_column(column_df)

    histogram.bin_list = histogram._calculate_bins()
    histogram._add_hist(first_column, 'value')
    histogram._add_hist(second_column, 'value2')

    self.assertEqual(2, len(histogram.hist_dict))
    expected_counts = [1, 2]
    self.assertListEqual(expected_counts, histogram.hist_dict['value2'])
def test_add_hist_single_value(self):
    """Check the bin layout when a column contains only a single distinct value.

    A column of 100 identical values is added to a histogram with
    ``nr_bins`` bins and built. The test expects:

    * ``nr_bins + 1`` bin borders,
    * the lowest border at the single value - 0.5 and the highest border at
      the single value + 0.5,
    * all rows counted in the middle bin.
    """
    single_column_value = 1
    nr_bins = 5
    column_values = [single_column_value] * 100
    test_df = self.sqlCtx.createDataFrame(
        pd.DataFrame({'foo': column_values}))

    hist = Histogram(bins=nr_bins)
    hist.add_column(test_df.select(F.col('foo')))
    hist.build()

    # NOTE(review): the other tests in this file read ``bin_list`` while this
    # one reads ``bin_boundaries`` -- confirm which attribute name is current.
    self.assertEqual(nr_bins + 1, len(hist.bin_boundaries))
    self.assertEqual(single_column_value - 0.5, min(hist.bin_boundaries))
    self.assertEqual(single_column_value + 0.5, max(hist.bin_boundaries))

    # Floor division always yields an int, so it is a valid sequence index on
    # both Python 2 and 3 (math.floor returns a float on Python 2).
    middle_bin = nr_bins // 2
    self.assertEqual(len(column_values), hist.hist_dict['foo'][middle_bin])
def test_add_column_more_then_1_column_in_dataframe(self):
    """Check that a data frame with more than one column is rejected.

    The full, unselected test data frame is passed to ``add_column``, which
    must raise a ``ValueError``.
    """
    histogram = Histogram(bins=10)
    multi_column_df = self.create_test_df()

    with self.assertRaises(ValueError):
        histogram.add_column(multi_column_df)
def test_calculate_bins_single_column(self):
    """Check that ``_calculate_bins`` returns the bin count for a single column.

    With only one column added and no min and max set, the method is expected
    to return the requested number of bins (5), not a list of borders.
    """
    requested_bins = 5
    histogram = Histogram(bins=requested_bins)
    value_column = self.create_test_df().select(F.col('value'))
    histogram.add_column(value_column)

    self.assertEqual(5, histogram._calculate_bins())