Exemple #1
0
def plot_histogram(data: Union["ps.DataFrame", "ps.Series"], **kwargs):
    """Build a stacked plotly bar figure showing histogram counts of ``data``.

    The ``bins`` keyword (default ``10``) is forwarded to
    ``HistogramPlotBase.prepare_hist_data``; the bin edges it returns are used
    both to compute the counts and to label every bar. One ``go.Bar`` trace is
    produced per series yielded by ``HistogramPlotBase.compute_hist``.

    Returns the resulting ``plotly.graph_objs.Figure``.
    """
    import plotly.graph_objs as go

    kdf, edges = HistogramPlotBase.prepare_hist_data(data, kwargs.get("bins", 10))
    assert len(edges) > 2, "the number of buckets must be higher than 2."
    histograms = HistogramPlotBase.compute_hist(kdf, edges)

    # Round every edge to nine decimals so the hover labels stay readable.
    rounded = [float("%.9f" % edge) for edge in edges]
    hover_labels = [
        "[%s, %s)" % (left, right) for left, right in zip(rounded, rounded[1:])
    ]
    # Only the final bucket is closed on its right side: swap ")" for "]".
    hover_labels[-1] = hover_labels[-1][:-1] + "]"

    # Each bar sits at the midpoint between two adjacent edges.
    centers = 0.5 * (edges[:-1] + edges[1:])

    def as_bar(series):
        # One trace per series, named after the series it came from.
        variable = name_like_string(series.name)
        return go.Bar(
            x=centers,
            y=series,
            name=variable,
            text=hover_labels,
            hovertemplate="variable=" + variable + "<br>value=%{text}<br>count=%{y}",
        )

    traces = [as_bar(series) for series in histograms]
    fig = go.Figure(data=traces, layout=go.Layout(barmode="stack"))

    layout = fig["layout"]
    layout["xaxis"]["title"] = "value"
    layout["yaxis"]["title"] = "count"
    return fig
Exemple #2
0
    def test_compute_hist_multi_columns(self):
        """Check bin edges and per-column counts for a two-column frame.

        ``get_bins`` with 10 buckets must yield 11 evenly spaced edges from 1
        to 50, and ``compute_hist`` must return one named series of counts per
        column, in column order.
        """
        kdf = ps.DataFrame(
            {
                "a": [1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 50],
                "b": [50, 50, 30, 30, 30, 24, 10, 5, 4, 3, 1],
            }
        )

        bins = HistogramPlotBase.get_bins(kdf.to_spark(), 10)
        self.assert_eq(pd.Series(np.linspace(1, 50, 11)), pd.Series(bins))

        # Expected counts keyed by the column they belong to.
        expected_by_column = {
            "a": np.array([5, 4, 1, 0, 0, 0, 0, 0, 0, 1]),
            "b": np.array([4, 1, 0, 0, 1, 3, 0, 0, 0, 2]),
        }
        histograms = HistogramPlotBase.compute_hist(kdf, bins)

        for histogram, (column, counts) in zip(histograms, expected_by_column.items()):
            expected = pd.Series(counts, name=column)
            self.assert_eq(expected, histogram, almost=True)
Exemple #3
0
    def _make_plot(self):
        """Plot the histogram of every column and register its legend handle.

        Counts come from ``HistogramPlotBase.compute_hist(self.data, self.bins)``
        and are paired, in order, with ``self.data._internal.column_labels``.
        """
        # TODO: this logic is similar with KdePlot. Might have to deduplicate it.
        # Passing the real 'num_colors' would need `shape`, which has to count
        # all. A single color is requested for now to save that computation.
        palette = self._get_colors(num_colors=1)
        stacking_id = self._get_stacking_id()
        histograms = HistogramPlotBase.compute_hist(self.data, self.bins)

        column_labels = self.data._internal.column_labels
        for idx, (column_label, counts) in enumerate(zip(column_labels, histograms)):
            ax = self._get_ax(idx)

            plot_kwds = self.kwds.copy()

            # A label with a single element is shown as that element alone.
            shown = column_label if len(column_label) > 1 else column_label[0]
            legend_label = pprint_thing(shown)
            plot_kwds["label"] = legend_label

            style, plot_kwds = self._apply_style_colors(
                palette, plot_kwds, idx, legend_label
            )
            if style is not None:
                plot_kwds["style"] = style

            plot_kwds = self._make_plot_keywords(plot_kwds, counts)
            artists = self._plot(
                ax, counts, column_num=idx, stacking_id=stacking_id, **plot_kwds
            )
            # The first returned artist represents the column in the legend.
            self._add_legend_handle(artists[0], legend_label, index=idx)
Exemple #4
0
    def test_compute_hist_single_column(self):
        """Check bin edges and counts for a one-column frame.

        The frame is built with an index containing repeated values. Ten
        buckets over column "a" must give 11 evenly spaced edges from 1 to 50
        and the expected count per bucket.
        """
        values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 50]
        index = [0, 1, 3, 5, 6, 8, 9, 9, 9, 10, 10]
        kdf = ps.DataFrame({"a": values}, index=index)
        column_a = kdf[["a"]]

        bins = HistogramPlotBase.get_bins(column_a.to_spark(), 10)
        histogram = HistogramPlotBase.compute_hist(column_a, bins)[0]

        self.assert_eq(pd.Series(np.linspace(1, 50, 11)), pd.Series(bins))
        self.assert_eq(
            pd.Series(np.array([5, 4, 1, 0, 0, 0, 0, 0, 0, 1]), name="a"),
            histogram,
            almost=True,
        )
Exemple #5
0
def plot_histogram(data: Union["ps.DataFrame", "ps.Series"], **kwargs):
    """Build a stacked plotly bar figure showing histogram counts of ``data``.

    Recognised keywords:

    * ``bins`` (default ``10``) is forwarded to
      ``HistogramPlotBase.prepare_hist_data``.
    * ``y`` selects ``data[y]`` when ``data`` is a DataFrame. It is tested for
      truthiness, so a falsy value (``None``, ``0``, ``""``, an empty list)
      leaves ``data`` untouched.
    * Any keyword whose name is a parameter of ``go.Layout`` is passed to the
      layout. ``barmode`` is always set to ``"stack"`` afterwards, and the
      axis titles are set to ``"value"`` / ``"count"``.

    Returns the resulting ``plotly.graph_objs.Figure``.
    """
    import plotly.graph_objs as go
    import pyspark.pandas as ps

    bin_spec = kwargs.get("bins", 10)
    selected = kwargs.get("y")
    if selected and isinstance(data, ps.DataFrame):
        # Note that the results here are matched with matplotlib. x and y
        # handling is different from pandas' plotly output.
        data = data[selected]

    psdf, edges = HistogramPlotBase.prepare_hist_data(data, bin_spec)
    assert len(edges) > 2, "the number of buckets must be higher than 2."
    histograms = HistogramPlotBase.compute_hist(psdf, edges)

    # Round every edge to nine decimals so the hover labels stay readable.
    rounded = [float("%.9f" % edge) for edge in edges]
    hover_labels = [
        "[%s, %s)" % (left, right) for left, right in zip(rounded, rounded[1:])
    ]
    # Only the final bucket is closed on its right side: swap ")" for "]".
    hover_labels[-1] = hover_labels[-1][:-1] + "]"

    # Each bar sits at the midpoint between two adjacent edges.
    centers = 0.5 * (edges[:-1] + edges[1:])

    def as_bar(series):
        # One trace per series, named after the series it came from.
        variable = name_like_string(series.name)
        return go.Bar(
            x=centers,
            y=series,
            name=variable,
            text=hover_labels,
            hovertemplate="variable=" + variable + "<br>value=%{text}<br>count=%{y}",
        )

    traces = [as_bar(series) for series in histograms]

    # Keep only the keywords that go.Layout itself accepts.
    accepted = inspect.signature(go.Layout).parameters
    layout_kwargs = {key: kwargs[key] for key in kwargs if key in accepted}

    fig = go.Figure(data=traces, layout=go.Layout(**layout_kwargs))
    layout = fig["layout"]
    layout["barmode"] = "stack"
    layout["xaxis"]["title"] = "value"
    layout["yaxis"]["title"] = "count"
    return fig