Beispiel #1
0
        def check_box_summary(psdf, pdf):
            """Check the box-plot summary of column ``"a"`` against pandas/NumPy.

            ``psdf["a"]`` is passed through ``BoxPlotBase`` (stats, fences,
            outliers, whiskers, fliers) and every result is compared with the
            value derived from ``pdf["a"]``.

            Args:
                psdf: frame whose ``"a"`` column is handed to ``BoxPlotBase``.
                pdf: pandas frame providing the expected values; it is only
                    read, no helper column is written back into it.
            """
            k = 1.5
            stats, fences = BoxPlotBase.compute_stats(psdf["a"],
                                                      "a",
                                                      whis=k,
                                                      precision=0.01)
            outliers = BoxPlotBase.outliers(psdf["a"], "a", *fences)
            whiskers = BoxPlotBase.calc_whiskers("a", outliers)
            fliers = BoxPlotBase.get_fliers("a", outliers, whiskers[0])

            values = pdf["a"]
            expected_mean = values.mean()
            expected_median = values.median()
            expected_q1 = np.percentile(values, 25)
            expected_q3 = np.percentile(values, 75)
            iqr = expected_q3 - expected_q1
            expected_fences = (expected_q1 - k * iqr, expected_q3 + k * iqr)

            # Rows are classified with the fences returned by compute_stats
            # (not the exact ones), so whiskers and fliers are compared
            # against the same cut-offs. A local mask is used instead of
            # adding an "outlier" column to the caller's frame.
            within_fences = values.between(fences[0], fences[1])
            inliers = values[within_fences]
            expected_whiskers = (inliers.min(), inliers.max())
            expected_fliers = values[~within_fences].values

            self.assertEqual(expected_mean, stats["mean"])
            self.assertEqual(expected_median, stats["med"])
            # NOTE(review): the constant offsets below presumably compensate
            # for the approximate quartiles on the fixture data - confirm.
            self.assertEqual(expected_q1, stats["q1"] + 0.5)
            self.assertEqual(expected_q3, stats["q3"] - 0.5)
            self.assertEqual(expected_fences[0], fences[0] + 2.0)
            self.assertEqual(expected_fences[1], fences[1] - 2.0)
            self.assertEqual(expected_whiskers[0], whiskers[0])
            self.assertEqual(expected_whiskers[1], whiskers[1])
            # assertEqual on array-likes evaluates the truth value of an
            # element-wise comparison, which raises ValueError once there is
            # more than one flier. Compare the sorted values element-wise
            # instead, so the check is also independent of row order.
            np.testing.assert_array_equal(np.sort(expected_fliers),
                                          np.sort(fliers))
Beispiel #2
0
    def _compute_plot_data(self):
        """Reduce ``self.data`` to the statistics needed to draw one box.

        The keyword options in ``self.kwds`` are first completed with
        ``PandasOnSparkBoxPlot.rc_defaults``. ``BoxPlotBase`` then provides
        the summary statistics, the fences, the outlier flags, the whiskers
        and - only when ``showfliers`` is truthy - the fliers. On exit
        ``self.data`` is replaced by ``{label: [stats_dict]}`` where
        ``label`` is the first entry of the ``labels`` option (defaulting to
        the name of the original data).
        """
        series = self.data
        default_label = series.name
        spark_name = series._internal.spark_column_name_for(
            series._column_label)

        # Complete the options with the rc defaults from matplotlib
        self.kwds.update(PandasOnSparkBoxPlot.rc_defaults(**self.kwds))

        # Options that drive the computation
        showfliers = self.kwds.get("showfliers", False)
        whis = self.kwds.get("whis", 1.5)
        labels = self.kwds.get("labels", [default_label])
        # pandas-on-Spark specific: precision used for approx_percentile
        precision = self.kwds.get("precision", 0.01)

        # Mean, median, Q1 and Q3 (approx_percentile with precision) + fences
        summary, fences = BoxPlotBase.compute_stats(
            series, spark_name, whis, precision)

        # Rows flagged as outliers or not
        flagged = BoxPlotBase.outliers(series, spark_name, *fences)

        # Whiskers: min and max values of the non-outliers
        whiskers = BoxPlotBase.calc_whiskers(spark_name, flagged)

        # Fliers are only fetched on request
        fliers = (
            BoxPlotBase.get_fliers(spark_name, flagged, whiskers[0])
            if showfliers
            else []
        )

        # Assemble the bxpstats entry, keeping the key order
        # mean, med, q1, q3, whislo, whishi, fliers, label
        entry = {key: summary[key] for key in ("mean", "med", "q1", "q3")}
        entry.update(
            whislo=whiskers[0],
            whishi=whiskers[1],
            fliers=fliers,
            label=labels[0],
        )

        self.data = {labels[0]: [entry]}
Beispiel #3
0
        def check_box_multi_columns(psdf):
            """Check multi-column box statistics against the per-column ones.

            For each of the columns ``"a"``, ``"b"`` and ``"c"`` the values
            produced by the ``BoxPlotBase`` multi-column helpers must equal
            the values produced by the single-column helpers on
            ``psdf[column]``.
            """
            k = 1.5
            columns = ("a", "b", "c")

            # Multi-column path: one pass over all columns
            all_stats = BoxPlotBase.compute_multicol_stats(
                psdf, list(columns), whis=k, precision=0.01)
            all_outliers = BoxPlotBase.multicol_outliers(psdf, all_stats)
            all_whiskers = BoxPlotBase.calc_multicol_whiskers(
                list(columns), all_outliers)

            for name in columns:
                multi_stats = all_stats[name]
                multi_whiskers = all_whiskers[name]

                # Single-column path for the same column
                single_stats, single_fences = BoxPlotBase.compute_stats(
                    psdf[name], name, whis=k, precision=0.01)
                single_outliers = BoxPlotBase.outliers(
                    psdf[name], name, *single_fences)
                single_whiskers = BoxPlotBase.calc_whiskers(
                    name, single_outliers)

                for key in ("mean", "med", "q1", "q3"):
                    self.assertEqual(single_stats[key], multi_stats[key])
                for position, key in enumerate(("lfence", "ufence")):
                    self.assertEqual(single_fences[position],
                                     multi_stats[key])
                for position, key in enumerate(("min", "max")):
                    self.assertEqual(single_whiskers[position],
                                     multi_whiskers[key])
Beispiel #4
0
def plot_box(data: Union["ps.DataFrame", "ps.Series"], **kwargs):
    """Build a plotly box-plot figure from precomputed statistics.

    Args:
        data: the data to plot; a ``ps.DataFrame`` is rejected.
        **kwargs: ``whis``, ``precision``, ``boxpoints`` and ``notched`` are
            consumed here; everything else is forwarded to ``go.Box``.

    Returns:
        A ``go.Figure`` holding a single ``go.Box`` trace, with the x axis
        titled after the data name and the y axis titled ``"value"``.

    Raises:
        RuntimeError: if ``data`` is a ``ps.DataFrame``.
        ValueError: if ``boxpoints`` is neither ``"suspectedoutliers"`` nor
            ``False``, or if ``notched`` is truthy.
    """
    import plotly.graph_objs as go
    import pyspark.pandas as ps

    if isinstance(data, ps.DataFrame):
        raise RuntimeError(
            "plotly does not support a box plot with Koalas DataFrame. Use Series instead."
        )

    # 'whis' is a matplotlib argument, not a plotly one; plotly does not seem
    # to expose how far the whiskers reach beyond the first and third
    # quartiles and appears to use the default of 1.5.
    whis = kwargs.pop("whis", 1.5)
    # 'precision' is Koalas specific: it controls approx_percentile
    precision = kwargs.pop("precision", 0.01)

    # Plotly options, restricted to the values supported here
    boxpoints = kwargs.pop("boxpoints", "suspectedoutliers")
    notched = kwargs.pop("notched", False)
    if boxpoints not in ["suspectedoutliers", False]:
        message = (
            "plotly plotting backend does not support 'boxpoints' set to '%s'. "
            "Set to 'suspectedoutliers' or False.")
        raise ValueError(message % boxpoints)
    if notched:
        message = (
            "plotly plotting backend does not support 'notched' set to '%s'. "
            "Set to False.")
        raise ValueError(message % notched)

    title = name_like_string(data.name)
    spark_name = data._internal.spark_column_name_for(data._column_label)

    # Mean, median, Q1 and Q3 (approx_percentile with precision) + fences
    summary, fences = BoxPlotBase.compute_stats(data, spark_name, whis,
                                                precision)

    # Rows flagged as outliers or not
    flagged = BoxPlotBase.outliers(data, spark_name, *fences)

    # Whiskers: min and max values of the non-outliers
    whiskers = BoxPlotBase.calc_whiskers(spark_name, flagged)

    # Individual points are only passed on when requested and non-empty
    points = None
    if boxpoints:
        found = BoxPlotBase.get_fliers(spark_name, flagged, whiskers[0])
        if len(found) > 0:
            points = [found]

    fig = go.Figure()
    # Remaining kwargs are forwarded as a workaround: Box takes different
    # options from express.box.
    trace = go.Box(
        name=title,
        q1=[summary["q1"]],
        median=[summary["med"]],
        q3=[summary["q3"]],
        mean=[summary["mean"]],
        lowerfence=[whiskers[0]],
        upperfence=[whiskers[1]],
        y=points,
        boxpoints=boxpoints,
        notched=notched,
        **kwargs,
    )
    fig.add_trace(trace)

    layout = fig["layout"]
    layout["xaxis"]["title"] = title
    layout["yaxis"]["title"] = "value"
    return fig