def check_box_summary(psdf, pdf):
    """Check the box-plot summary of column ``"a"`` against pandas/NumPy values.

    The statistics, fences, whiskers and fliers produced by the
    ``BoxPlotBase`` helpers for ``psdf["a"]`` are compared with reference
    values derived from ``pdf["a"]``.

    Notes:
        * ``self`` is not a parameter; it is resolved from the enclosing
          scope and must provide ``assertEqual`` (presumably a
          ``unittest.TestCase`` -- confirm against the caller).
        * ``pdf`` is modified in place: a boolean ``"outlier"`` column is added.
        * The outlier mask on ``pdf`` uses the fences returned by
          ``BoxPlotBase.compute_stats``, not the reference fences.
        * The quartile and fence assertions carry fixed offsets (0.5 and 2.0).
          NOTE(review): these look tuned to the fixture data and the
          approximate percentile computation -- confirm before reuse.
    """
    whis = 1.5
    column = "a"

    # Values under test, produced by the BoxPlotBase helpers.
    stats, fences = BoxPlotBase.compute_stats(
        psdf[column], column, whis=whis, precision=0.01
    )
    flagged = BoxPlotBase.outliers(psdf[column], column, *fences)
    whiskers = BoxPlotBase.calc_whiskers(column, flagged)
    fliers = BoxPlotBase.get_fliers(column, flagged, whiskers[0])

    # Reference values computed locally from the pandas frame.
    reference = pdf[column]
    want_mean = reference.mean()
    want_median = reference.median()
    want_q1 = np.percentile(reference, 25)
    want_q3 = np.percentile(reference, 75)
    spread = want_q3 - want_q1
    want_lower_fence = want_q1 - whis * spread
    want_upper_fence = want_q3 + whis * spread

    # Flag the rows lying outside the fences that were computed above.
    pdf["outlier"] = ~reference.between(fences[0], fences[1])
    inliers = pdf.query("not outlier")[column]
    want_whisker_low = inliers.min()
    want_whisker_high = inliers.max()
    want_fliers = pdf.query("outlier")[column].values

    self.assertEqual(want_mean, stats["mean"])
    self.assertEqual(want_median, stats["med"])
    self.assertEqual(want_q1, stats["q1"] + 0.5)
    self.assertEqual(want_q3, stats["q3"] - 0.5)
    self.assertEqual(want_lower_fence, fences[0] + 2.0)
    self.assertEqual(want_upper_fence, fences[1] - 2.0)
    self.assertEqual(want_whisker_low, whiskers[0])
    self.assertEqual(want_whisker_high, whiskers[1])
    self.assertEqual(want_fliers, fliers)
def _compute_plot_data(self):
    """Replace ``self.data`` with the box-plot summary of the current series.

    Reads the series stored in ``self.data`` and the options in
    ``self.kwds``, computes the summary through the ``BoxPlotBase`` helpers,
    and stores the result back on ``self.data`` as
    ``{label: [stats_dict]}``, where ``label`` is the first entry of the
    ``labels`` option.

    Options read from ``self.kwds`` (after merging the rc defaults):
        showfliers: whether outlier points are collected (fallback ``False``).
        whis: whisker reach passed to ``compute_stats`` (fallback ``1.5``).
        labels: sequence whose first item labels the box (fallback
            ``[series name]``).
        precision: pandas-on-Spark specific; controls the precision used
            for approx_percentile (fallback ``0.01``).
    """
    series = self.data
    colname = series.name
    spark_column_name = series._internal.spark_column_name_for(series._column_label)

    # Merge the matplotlib rc defaults into the plotting options.
    options = self.kwds
    options.update(PandasOnSparkBoxPlot.rc_defaults(**options))

    showfliers = options.get("showfliers", False)
    whis = options.get("whis", 1.5)
    labels = options.get("labels", [colname])
    precision = options.get("precision", 0.01)

    # Mean, median, Q1 and Q3 (via approx_percentile) plus the two fences.
    col_stats, col_fences = BoxPlotBase.compute_stats(
        series, spark_column_name, whis, precision
    )

    # Rows flagged as outliers or not, relative to the fences.
    flagged = BoxPlotBase.outliers(series, spark_column_name, *col_fences)

    # Whiskers: min and max of the non-outlier values.
    whiskers = BoxPlotBase.calc_whiskers(spark_column_name, flagged)

    # Outlier points are only gathered when they will actually be drawn.
    fliers = (
        BoxPlotBase.get_fliers(spark_column_name, flagged, whiskers[0])
        if showfliers
        else []
    )

    # Single bxpstats-style entry describing the box.
    summary = {
        "mean": col_stats["mean"],
        "med": col_stats["med"],
        "q1": col_stats["q1"],
        "q3": col_stats["q3"],
        "whislo": whiskers[0],
        "whishi": whiskers[1],
        "fliers": fliers,
        "label": labels[0],
    }
    self.data = {labels[0]: [summary]}
def check_box_multi_columns(psdf):
    """Check that the multi-column box-plot helpers agree with the single-column ones.

    For each of the columns ``"a"``, ``"b"`` and ``"c"`` of ``psdf``, the
    mean, median, quartiles, fences and whiskers obtained from the
    ``BoxPlotBase`` multi-column helpers must equal those obtained by
    running the single-column helpers on that column alone.

    ``self`` is not a parameter; it is resolved from the enclosing scope and
    must provide ``assertEqual`` (presumably a ``unittest.TestCase`` --
    confirm against the caller).
    """
    whis = 1.5
    columns = ("a", "b", "c")

    # Multi-column path: one pass covering every column.
    # Each helper receives its own fresh list of column names.
    all_stats = BoxPlotBase.compute_multicol_stats(
        psdf, list(columns), whis=whis, precision=0.01
    )
    all_outliers = BoxPlotBase.multicol_outliers(psdf, all_stats)
    all_whiskers = BoxPlotBase.calc_multicol_whiskers(list(columns), all_outliers)

    for name in columns:
        multi_stats = all_stats[name]
        multi_whiskers = all_whiskers[name]

        # Single-column path for the same column.
        series = psdf[name]
        stats, fences = BoxPlotBase.compute_stats(
            series, name, whis=whis, precision=0.01
        )
        flagged = BoxPlotBase.outliers(series, name, *fences)
        whiskers = BoxPlotBase.calc_whiskers(name, flagged)

        self.assertEqual(stats["mean"], multi_stats["mean"])
        self.assertEqual(stats["med"], multi_stats["med"])
        self.assertEqual(stats["q1"], multi_stats["q1"])
        self.assertEqual(stats["q3"], multi_stats["q3"])
        self.assertEqual(fences[0], multi_stats["lfence"])
        self.assertEqual(fences[1], multi_stats["ufence"])
        self.assertEqual(whiskers[0], multi_whiskers["min"])
        self.assertEqual(whiskers[1], multi_whiskers["max"])
def plot_box(data: Union["ps.DataFrame", "ps.Series"], **kwargs):
    """Build a plotly box-plot figure for a single series.

    The box statistics are computed up front with the ``BoxPlotBase``
    helpers and handed to ``plotly.graph_objs.Box`` as precomputed values
    (``q1``, ``median``, ``q3``, ``mean``, ``lowerfence``, ``upperfence``).

    Args:
        data: the series to plot. A DataFrame is rejected.
        **kwargs: options. The following keys are consumed here:

            * ``whis`` (default ``1.5``): whisker reach passed to
              ``BoxPlotBase.compute_stats``. It is not a plotly argument.
            * ``precision`` (default ``0.01``): Koalas specific; controls
              the precision used for approx_percentile.
            * ``boxpoints`` (default ``"suspectedoutliers"``): must be
              ``"suspectedoutliers"`` or ``False``.
            * ``notched`` (default ``False``): must be falsy.

            Everything else is forwarded unchanged to
            ``plotly.graph_objs.Box``.

    Returns:
        A ``plotly.graph_objs.Figure`` holding one box trace, with the
        x-axis titled after the series name and the y-axis titled
        ``"value"``.

    Raises:
        RuntimeError: if ``data`` is a DataFrame.
        ValueError: if ``boxpoints`` or ``notched`` has an unsupported value.
    """
    import plotly.graph_objs as go
    import pyspark.pandas as ps

    if isinstance(data, ps.DataFrame):
        raise RuntimeError(
            "plotly does not support a box plot with Koalas DataFrame. Use Series instead."
        )

    # 'whis' is a matplotlib option rather than a plotly one; plotly does not
    # seem to expose the whisker reach beyond the first and third quartiles
    # and appears to use 1.5 by default.
    whis = kwargs.pop("whis", 1.5)
    # Koalas-specific knob: precision for approx_percentile.
    precision = kwargs.pop("precision", 0.01)

    # Plotly options that are only partially supported here.
    boxpoints = kwargs.pop("boxpoints", "suspectedoutliers")
    notched = kwargs.pop("notched", False)
    if boxpoints not in ("suspectedoutliers", False):
        raise ValueError(
            "plotly plotting backend does not support 'boxpoints' set to '%s'. "
            "Set to 'suspectedoutliers' or False." % boxpoints
        )
    if notched:
        raise ValueError(
            "plotly plotting backend does not support 'notched' set to '%s'. "
            "Set to False." % notched
        )

    colname = name_like_string(data.name)
    spark_column_name = data._internal.spark_column_name_for(data._column_label)

    # Mean, median, Q1 and Q3 (via approx_percentile) plus the two fences.
    col_stats, col_fences = BoxPlotBase.compute_stats(
        data, spark_column_name, whis, precision
    )

    # Rows flagged as outliers or not, relative to the fences.
    flagged = BoxPlotBase.outliers(data, spark_column_name, *col_fences)

    # Whiskers: min and max of the non-outlier values.
    whiskers = BoxPlotBase.calc_whiskers(spark_column_name, flagged)

    # Outlier points are passed as ``y`` only when requested and non-empty.
    outlier_points = None
    if boxpoints:
        found = BoxPlotBase.get_fliers(spark_column_name, flagged, whiskers[0])
        if len(found) > 0:
            outlier_points = [found]

    fig = go.Figure()
    box_trace = go.Box(
        name=colname,
        q1=[col_stats["q1"]],
        median=[col_stats["med"]],
        q3=[col_stats["q3"]],
        mean=[col_stats["mean"]],
        lowerfence=[whiskers[0]],
        upperfence=[whiskers[1]],
        y=outlier_points,
        boxpoints=boxpoints,
        notched=notched,
        # Pass-through for workarounds: Box takes different options from express.box.
        **kwargs,
    )
    fig.add_trace(box_trace)

    layout = fig["layout"]
    layout["xaxis"]["title"] = colname
    layout["yaxis"]["title"] = "value"
    return fig