def test_compute_hist_multi_columns(self):
    """Check bin edges and per-column counts for a two-column frame.

    The bin edges are computed once from the whole frame (10 bins requested,
    11 edges expected) and each column is then counted against those edges.
    """
    kdf = ps.DataFrame(
        {
            "a": [1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 50],
            "b": [50, 50, 30, 30, 30, 24, 10, 5, 4, 3, 1],
        }
    )

    expected_bins = np.linspace(1, 50, 11)
    bins = HistogramPlotBase.get_bins(kdf.to_spark(), 10)
    self.assert_eq(pd.Series(expected_bins), pd.Series(bins))

    # Expected (column name, counts per bin) in the order the columns appear.
    expected = [
        ("a", np.array([5, 4, 1, 0, 0, 0, 0, 0, 0, 1])),
        ("b", np.array([4, 1, 0, 0, 1, 3, 0, 0, 0, 2])),
    ]
    histograms = HistogramPlotBase.compute_hist(kdf, bins)

    # zip() stops at the shortest input, so without this check a missing
    # (or extra) histogram would go unnoticed and the test would still pass.
    self.assertEqual(
        len(expected),
        len(histograms),
        "compute_hist returned an unexpected number of histograms",
    )

    for (expected_name, expected_histogram), histogram in zip(expected, histograms):
        self.assert_eq(
            pd.Series(expected_histogram, name=expected_name), histogram, almost=True
        )
def test_compute_hist_single_column(self):
    """Check bin edges and counts for one column with a non-unique index.

    The frame's index contains gaps and repeated labels; the expected bins
    and counts are the ones listed below for the values in column "a".
    """
    values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 50]
    labels = [0, 1, 3, 5, 6, 8, 9, 9, 9, 10, 10]
    kdf = ps.DataFrame({"a": values}, index=labels)

    # What we expect back: 11 evenly spaced edges and the counts per bin.
    want_bins = pd.Series(np.linspace(1, 50, 11))
    want_hist = pd.Series(np.array([5, 4, 1, 0, 0, 0, 0, 0, 0, 1]), name="a")

    # Run both helpers on the single-column projection before asserting.
    only_a = kdf[["a"]]
    bins = HistogramPlotBase.get_bins(only_a.to_spark(), 10)
    histogram = HistogramPlotBase.compute_hist(only_a, bins)[0]

    self.assert_eq(want_bins, pd.Series(bins))
    self.assert_eq(want_hist, histogram, almost=True)