Beispiel #1
0
def _grouped_plot(plotf, data, column=None, by=None, numeric_only=True,
                  figsize=None, sharex=True, sharey=True, layout=None,
                  rot=0, ax=None, **kwargs):

    if figsize == 'default':
        # allowed to specify mpl default with 'default'
        warnings.warn("figsize='default' is deprecated. Specify figure "
                      "size by tuple instead", FutureWarning, stacklevel=5)
        figsize = None

    grouped = data.groupby(by)
    if column is not None:
        grouped = grouped[column]

    naxes = len(grouped)
    fig, axes = _subplots(naxes=naxes, figsize=figsize,
                          sharex=sharex, sharey=sharey, ax=ax,
                          layout=layout)

    _axes = _flatten(axes)

    for i, (key, group) in enumerate(grouped):
        ax = _axes[i]
        if numeric_only and isinstance(group, ABCDataFrame):
            group = group._get_numeric_data()
        plotf(group, ax, **kwargs)
        ax.set_title(pprint_thing(key))

    return fig, axes
Beispiel #2
0
def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None,
                          rot=0, grid=True, ax=None, figsize=None,
                          layout=None, sharex=False, sharey=True, **kwds):
    if subplots is True:
        naxes = len(grouped)
        fig, axes = _subplots(naxes=naxes, squeeze=False,
                              ax=ax, sharex=sharex, sharey=sharey,
                              figsize=figsize, layout=layout)
        axes = _flatten(axes)

        from pandas.core.series import Series
        ret = Series()
        for (key, group), ax in zip(grouped, axes):
            d = group.boxplot(ax=ax, column=column, fontsize=fontsize,
                              rot=rot, grid=grid, **kwds)
            ax.set_title(pprint_thing(key))
            ret.loc[key] = d
        fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1,
                            right=0.9, wspace=0.2)
    else:
        from pandas.core.reshape.concat import concat
        keys, frames = zip(*grouped)
        if grouped.axis == 0:
            df = concat(frames, keys=keys, axis=1)
        else:
            if len(frames) > 1:
                df = frames[0].join(frames[1::])
            else:
                df = frames[0]
        ret = df.boxplot(column=column, fontsize=fontsize, rot=rot,
                         grid=grid, ax=ax, figsize=figsize,
                         layout=layout, **kwds)
    return ret
Beispiel #3
0
def _grouped_plot_by_column(
    plotf,
    data,
    columns=None,
    by=None,
    numeric_only=True,
    grid=False,
    figsize=None,
    ax=None,
    layout=None,
    return_type=None,
    **kwargs,
):
    grouped = data.groupby(by)
    if columns is None:
        if not isinstance(by, (list, tuple)):
            by = [by]
        columns = data._get_numeric_data().columns.difference(by)
    naxes = len(columns)
    fig, axes = _subplots(naxes=naxes,
                          sharex=True,
                          sharey=True,
                          figsize=figsize,
                          ax=ax,
                          layout=layout)

    _axes = _flatten(axes)

    ax_values = []

    for i, col in enumerate(columns):
        ax = _axes[i]
        gp_col = grouped[col]
        keys, values = zip(*gp_col)
        re_plotf = plotf(keys, values, ax, **kwargs)
        ax.set_title(col)
        ax.set_xlabel(pprint_thing(by))
        ax_values.append(re_plotf)
        ax.grid(grid)

    result = pd.Series(ax_values, index=columns)

    # Return axes in multiplot case, maybe revisit later # 985
    if return_type is None:
        result = axes

    byline = by[0] if len(by) == 1 else by
    fig.suptitle(f"Boxplot grouped by {byline}")
    fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2)

    return result
Beispiel #4
0
def _grouped_plot(plotf,
                  data,
                  column=None,
                  by=None,
                  numeric_only=True,
                  figsize=None,
                  sharex=True,
                  sharey=True,
                  layout=None,
                  rot=0,
                  ax=None,
                  **kwargs):

    if figsize == "default":
        # allowed to specify mpl default with 'default'
        warnings.warn(
            "figsize='default' is deprecated. Specify figure "
            "size by tuple instead",
            FutureWarning,
            stacklevel=5,
        )
        figsize = None

    grouped = data.groupby(by)
    if column is not None:
        grouped = grouped[column]

    naxes = len(grouped)
    fig, axes = _subplots(naxes=naxes,
                          figsize=figsize,
                          sharex=sharex,
                          sharey=sharey,
                          ax=ax,
                          layout=layout)

    _axes = _flatten(axes)

    for i, (key, group) in enumerate(grouped):
        ax = _axes[i]
        if numeric_only and isinstance(group, ABCDataFrame):
            group = group._get_numeric_data()
        plotf(group, ax, **kwargs)
        ax.set_title(pprint_thing(key))

    return fig, axes
Beispiel #5
0
    def _setup_subplots(self):
        if self.subplots:
            fig, axes = _subplots(
                naxes=self.nseries,
                sharex=self.sharex,
                sharey=self.sharey,
                figsize=self.figsize,
                ax=self.ax,
                layout=self.layout,
                layout_type=self._layout_type,
            )
        else:
            if self.ax is None:
                fig = self.plt.figure(figsize=self.figsize)
                axes = fig.add_subplot(111)
            else:
                fig = self.ax.get_figure()
                if self.figsize is not None:
                    fig.set_size_inches(self.figsize)
                axes = self.ax

        axes = _flatten(axes)

        valid_log = {False, True, "sym", None}
        input_log = {self.logx, self.logy, self.loglog}
        if input_log - valid_log:
            invalid_log = next(iter((input_log - valid_log)))
            raise ValueError(
                "Boolean, None and 'sym' are valid options,"
                " '{}' is given.".format(invalid_log)
            )

        if self.logx is True or self.loglog is True:
            [a.set_xscale("log") for a in axes]
        elif self.logx == "sym" or self.loglog == "sym":
            [a.set_xscale("symlog") for a in axes]

        if self.logy is True or self.loglog is True:
            [a.set_yscale("log") for a in axes]
        elif self.logy == "sym" or self.loglog == "sym":
            [a.set_yscale("symlog") for a in axes]

        self.fig = fig
        self.axes = axes
Beispiel #6
0
def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None,
               xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False,
               sharey=False, figsize=None, layout=None, bins=10, **kwds):
    if by is not None:
        axes = _grouped_hist(data, column=column, by=by, ax=ax, grid=grid,
                             figsize=figsize, sharex=sharex, sharey=sharey,
                             layout=layout, bins=bins, xlabelsize=xlabelsize,
                             xrot=xrot, ylabelsize=ylabelsize,
                             yrot=yrot, **kwds)
        return axes

    if column is not None:
        if not isinstance(column, (list, np.ndarray, ABCIndexClass)):
            column = [column]
        data = data[column]
    data = data._get_numeric_data()
    naxes = len(data.columns)

    if naxes == 0:
        raise ValueError("hist method requires numerical columns, "
                         "nothing to plot.")

    fig, axes = _subplots(naxes=naxes, ax=ax, squeeze=False,
                          sharex=sharex, sharey=sharey, figsize=figsize,
                          layout=layout)
    _axes = _flatten(axes)

    for i, col in enumerate(com.try_sort(data.columns)):
        ax = _axes[i]
        ax.hist(data[col].dropna().values, bins=bins, **kwds)
        ax.set_title(col)
        ax.grid(grid)

    _set_ticks_props(axes, xlabelsize=xlabelsize, xrot=xrot,
                     ylabelsize=ylabelsize, yrot=yrot)
    fig.subplots_adjust(wspace=0.3, hspace=0.3)

    return axes
Beispiel #7
0
def scatter_matrix(
    frame,
    alpha=0.5,
    figsize=None,
    ax=None,
    grid=False,
    diagonal="hist",
    marker=".",
    density_kwds=None,
    hist_kwds=None,
    range_padding=0.05,
    **kwds,
):
    df = frame._get_numeric_data()
    n = df.columns.size
    naxes = n * n
    fig, axes = _subplots(naxes=naxes, figsize=figsize, ax=ax, squeeze=False)

    # no gaps between subplots
    fig.subplots_adjust(wspace=0, hspace=0)

    mask = notna(df)

    marker = _get_marker_compat(marker)

    hist_kwds = hist_kwds or {}
    density_kwds = density_kwds or {}

    # GH 14855
    kwds.setdefault("edgecolors", "none")

    boundaries_list = []
    for a in df.columns:
        values = df[a].values[mask[a].values]
        rmin_, rmax_ = np.min(values), np.max(values)
        rdelta_ext = (rmax_ - rmin_) * range_padding / 2.0
        boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))

    for i, a in enumerate(df.columns):
        for j, b in enumerate(df.columns):
            ax = axes[i, j]

            if i == j:
                values = df[a].values[mask[a].values]

                # Deal with the diagonal by drawing a histogram there.
                if diagonal == "hist":
                    ax.hist(values, **hist_kwds)

                elif diagonal in ("kde", "density"):
                    from scipy.stats import gaussian_kde

                    y = values
                    gkde = gaussian_kde(y)
                    ind = np.linspace(y.min(), y.max(), 1000)
                    ax.plot(ind, gkde.evaluate(ind), **density_kwds)

                ax.set_xlim(boundaries_list[i])

            else:
                common = (mask[a] & mask[b]).values

                ax.scatter(df[b][common],
                           df[a][common],
                           marker=marker,
                           alpha=alpha,
                           **kwds)

                ax.set_xlim(boundaries_list[j])
                ax.set_ylim(boundaries_list[i])

            ax.set_xlabel(b)
            ax.set_ylabel(a)

            if j != 0:
                ax.yaxis.set_visible(False)
            if i != n - 1:
                ax.xaxis.set_visible(False)

    if len(df.columns) > 1:
        lim1 = boundaries_list[0]
        locs = axes[0][1].yaxis.get_majorticklocs()
        locs = locs[(lim1[0] <= locs) & (locs <= lim1[1])]
        adj = (locs - lim1[0]) / (lim1[1] - lim1[0])

        lim0 = axes[0][0].get_ylim()
        adj = adj * (lim0[1] - lim0[0]) + lim0[0]
        axes[0][0].yaxis.set_ticks(adj)

        if np.all(locs == locs.astype(int)):
            # if all ticks are int
            locs = locs.astype(int)
        axes[0][0].yaxis.set_ticklabels(locs)

    _set_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0)

    return axes
Beispiel #8
0
def hist_frame(
    data,
    column=None,
    by=None,
    grid=True,
    xlabelsize=None,
    xrot=None,
    ylabelsize=None,
    yrot=None,
    ax=None,
    sharex=False,
    sharey=False,
    figsize=None,
    layout=None,
    bins=10,
    **kwds,
):
    # Start with empty pandas data frame derived from
    ed_df_bins, ed_df_weights = data._hist(num_bins=bins)

    converter._WARN = False  # no warning for pandas plots
    if by is not None:
        raise NotImplementedError("TODO")

    if column is not None:
        if not isinstance(column, (list, np.ndarray, ABCIndexClass)):
            column = [column]
        ed_df_bins = ed_df_bins[column]
        ed_df_weights = ed_df_weights[column]
    naxes = len(ed_df_bins.columns)

    if naxes == 0:
        raise ValueError("hist method requires numerical columns, " "nothing to plot.")

    fig, axes = _subplots(
        naxes=naxes,
        ax=ax,
        squeeze=False,
        sharex=sharex,
        sharey=sharey,
        figsize=figsize,
        layout=layout,
    )
    _axes = _flatten(axes)

    for i, col in enumerate(com.try_sort(data.columns)):
        ax = _axes[i]
        ax.hist(
            ed_df_bins[col][:-1],
            bins=ed_df_bins[col],
            weights=ed_df_weights[col],
            **kwds,
        )
        ax.set_title(col)
        ax.grid(grid)

    _set_ticks_props(
        axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot
    )
    fig.subplots_adjust(wspace=0.3, hspace=0.3)

    return axes
Beispiel #9
0
def hist_frame(data,
               column=None,
               by=None,
               grid=True,
               xlabelsize=None,
               xrot=None,
               ylabelsize=None,
               yrot=None,
               ax=None,
               sharex=False,
               sharey=False,
               figsize=None,
               layout=None,
               bins=10,
               **kwds):
    converter._WARN = False  # no warning for pandas plots
    if by is not None:
        axes = _grouped_hist(data,
                             column=column,
                             by=by,
                             ax=ax,
                             grid=grid,
                             figsize=figsize,
                             sharex=sharex,
                             sharey=sharey,
                             layout=layout,
                             bins=bins,
                             xlabelsize=xlabelsize,
                             xrot=xrot,
                             ylabelsize=ylabelsize,
                             yrot=yrot,
                             **kwds)
        return axes

    if column is not None:
        if not isinstance(column, (list, np.ndarray, ABCIndexClass)):
            column = [column]
        data = data[column]
    data = data._get_numeric_data()
    naxes = len(data.columns)

    if naxes == 0:
        raise ValueError("hist method requires numerical columns, "
                         "nothing to plot.")

    fig, axes = _subplots(naxes=naxes,
                          ax=ax,
                          squeeze=False,
                          sharex=sharex,
                          sharey=sharey,
                          figsize=figsize,
                          layout=layout)
    _axes = _flatten(axes)

    for i, col in enumerate(com.try_sort(data.columns)):
        ax = _axes[i]
        ax.hist(data[col].dropna().values, bins=bins, **kwds)
        ax.set_title(col)
        ax.grid(grid)

    _set_ticks_props(axes,
                     xlabelsize=xlabelsize,
                     xrot=xrot,
                     ylabelsize=ylabelsize,
                     yrot=yrot)
    fig.subplots_adjust(wspace=0.3, hspace=0.3)

    return axes
Beispiel #10
0
def hist_frame(
    data,
    column=None,
    by=None,
    grid=True,
    xlabelsize=None,
    xrot=None,
    ylabelsize=None,
    yrot=None,
    ax=None,
    sharex=False,
    sharey=False,
    figsize=None,
    layout=None,
    bins=10,
    legend: bool = False,
    **kwds,
):
    if legend and "label" in kwds:
        raise ValueError("Cannot use both legend and label")
    if by is not None:
        axes = _grouped_hist(
            data,
            column=column,
            by=by,
            ax=ax,
            grid=grid,
            figsize=figsize,
            sharex=sharex,
            sharey=sharey,
            layout=layout,
            bins=bins,
            xlabelsize=xlabelsize,
            xrot=xrot,
            ylabelsize=ylabelsize,
            yrot=yrot,
            legend=legend,
            **kwds,
        )
        return axes

    if column is not None:
        if not isinstance(column, (list, np.ndarray, ABCIndexClass)):
            column = [column]
        data = data[column]
    data = data._get_numeric_data()
    naxes = len(data.columns)

    if naxes == 0:
        raise ValueError(
            "hist method requires numerical columns, nothing to plot.")

    fig, axes = _subplots(
        naxes=naxes,
        ax=ax,
        squeeze=False,
        sharex=sharex,
        sharey=sharey,
        figsize=figsize,
        layout=layout,
    )
    _axes = _flatten(axes)

    can_set_label = "label" not in kwds

    for i, col in enumerate(data.columns):
        ax = _axes[i]
        if legend and can_set_label:
            kwds["label"] = col
        ax.hist(data[col].dropna().values, bins=bins, **kwds)
        ax.set_title(col)
        ax.grid(grid)
        if legend:
            ax.legend()

    _set_ticks_props(axes,
                     xlabelsize=xlabelsize,
                     xrot=xrot,
                     ylabelsize=ylabelsize,
                     yrot=yrot)
    fig.subplots_adjust(wspace=0.3, hspace=0.3)

    return axes
Beispiel #11
0
def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False,
                   diagonal='hist', marker='.', density_kwds=None,
                   hist_kwds=None, range_padding=0.05, **kwds):
    df = frame._get_numeric_data()
    n = df.columns.size
    naxes = n * n
    fig, axes = _subplots(naxes=naxes, figsize=figsize, ax=ax,
                          squeeze=False)

    # no gaps between subplots
    fig.subplots_adjust(wspace=0, hspace=0)

    mask = notna(df)

    marker = _get_marker_compat(marker)

    hist_kwds = hist_kwds or {}
    density_kwds = density_kwds or {}

    # GH 14855
    kwds.setdefault('edgecolors', 'none')

    boundaries_list = []
    for a in df.columns:
        values = df[a].values[mask[a].values]
        rmin_, rmax_ = np.min(values), np.max(values)
        rdelta_ext = (rmax_ - rmin_) * range_padding / 2.
        boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))

    for i, a in enumerate(df.columns):
        for j, b in enumerate(df.columns):
            ax = axes[i, j]

            if i == j:
                values = df[a].values[mask[a].values]

                # Deal with the diagonal by drawing a histogram there.
                if diagonal == 'hist':
                    ax.hist(values, **hist_kwds)

                elif diagonal in ('kde', 'density'):
                    from scipy.stats import gaussian_kde
                    y = values
                    gkde = gaussian_kde(y)
                    ind = np.linspace(y.min(), y.max(), 1000)
                    ax.plot(ind, gkde.evaluate(ind), **density_kwds)

                ax.set_xlim(boundaries_list[i])

            else:
                common = (mask[a] & mask[b]).values

                ax.scatter(df[b][common], df[a][common],
                           marker=marker, alpha=alpha, **kwds)

                ax.set_xlim(boundaries_list[j])
                ax.set_ylim(boundaries_list[i])

            ax.set_xlabel(b)
            ax.set_ylabel(a)

            if j != 0:
                ax.yaxis.set_visible(False)
            if i != n - 1:
                ax.xaxis.set_visible(False)

    if len(df.columns) > 1:
        lim1 = boundaries_list[0]
        locs = axes[0][1].yaxis.get_majorticklocs()
        locs = locs[(lim1[0] <= locs) & (locs <= lim1[1])]
        adj = (locs - lim1[0]) / (lim1[1] - lim1[0])

        lim0 = axes[0][0].get_ylim()
        adj = adj * (lim0[1] - lim0[0]) + lim0[0]
        axes[0][0].yaxis.set_ticks(adj)

        if np.all(locs == locs.astype(int)):
            # if all ticks are int
            locs = locs.astype(int)
        axes[0][0].yaxis.set_ticklabels(locs)

    _set_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0)

    return axes
Beispiel #12
0
def scatter_matrix(
        frame,
        alpha=0.5,
        figsize=None,
        ax=None,
        grid=False,
        diagonal="hist",
        marker=".",
        density_kwds=None,
        hist_kwds=None,
        range_padding=0.05,
        plot_axes="all",  # "all", "lower", "upper"
        ylabel_direction=None,  # "left", "right",
        refresh_labels=False,
        scales=None,  # list of scale ["linear" or "log"]
        **kwds):
    '''
    Modification of pandas.scatter_matrix.
    '''

    if scales is None: scales = ["linear"] * len(frame.columns)

    def _get_marker_compat(marker):
        if marker not in mlines.lineMarkers:
            return "o"
        return marker

    df = frame._get_numeric_data()
    n = df.columns.size
    naxes = n * n
    fig, axes = _subplots(naxes=naxes, figsize=figsize, ax=ax, squeeze=False)

    if axes.ndim != 2:  # Note: axに二次元配列入れてても_subplotsは一次元配列を生成する
        axes = axes.reshape((n, n))

    ylabel_direction = ylabel_direction or (
        "left" if plot_axes != "upper" else "right"
    )  # default: "lower" --> "left", "upper"  --> "right"

    # no gaps between subplots
    fig.subplots_adjust(wspace=0, hspace=0)

    mask = notna(df)

    marker = _get_marker_compat(marker)

    hist_kwds = hist_kwds or {}
    density_kwds = density_kwds or {}

    # GH 14855
    kwds.setdefault("edgecolors", "none")

    boundaries_list = []
    for i, a in enumerate(df.columns):
        values = df[a].values[mask[a].values]
        rmin_, rmax_ = np.min(values), np.max(values)
        if scales[i] == "linear":
            rdelta_ext = (rmax_ - rmin_) * range_padding / 2.0
            boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))
        elif scales[i] == "log":
            rdelta_ext = np.exp(
                (np.log(rmax_) - np.log(rmin_)) * range_padding / 2.0)
            boundaries_list.append((rmin_ / rdelta_ext, rmax_ * rdelta_ext))

    for i, a in enumerate(df.columns):
        for j, b in enumerate(df.columns):
            ax = axes[i, j]
            ax.set_visible(False)  # 一旦非表示にしておく

            if i == j:
                values = df[a].values[mask[a].values]

                # Deal with the diagonal by drawing a histogram there.
                if diagonal == "hist":
                    if "bins" not in hist_kwds: hist_kwds["bins"] = 16
                    _hist_kwds = {key: val for key, val in hist_kwds.items()}
                    if scales[i] == "log" and type(hist_kwds["bins"]) is int:
                        _hist_kwds["bins"] = np.logspace(
                            np.log10(np.min(values)), np.log10(np.max(values)),
                            hist_kwds["bins"])
                    ax.hist(values, **_hist_kwds)

                elif diagonal in ("kde", "density"):
                    from scipy.stats import gaussian_kde

                    y = values
                    gkde = gaussian_kde(y)
                    ind = np.linspace(y.min(), y.max(), 1000)
                    ax.plot(ind, gkde.evaluate(ind), **density_kwds)

                ax.set_xscale(scales[i])
                ax.set_xlim(boundaries_list[i])
                ax.set_visible(True)

            elif plot_axes == "all" or (i > j and plot_axes == "lower") or (
                    i < j and plot_axes == "upper"):
                common = (mask[a] & mask[b]).values

                ax.scatter(df[b][common],
                           df[a][common],
                           marker=marker,
                           alpha=alpha,
                           **kwds)

                ax.set_xscale(scales[j])
                ax.set_yscale(scales[i])

                ax.set_xlim(boundaries_list[j])
                ax.set_ylim(boundaries_list[i])
                ax.set_visible(True)

            ax.set_xlabel(b)
            ax.set_ylabel(a)

            if refresh_labels:
                ax.xaxis.set_visible(True)
                ax.yaxis.set_visible(True)

            if plot_axes in ("all", "lower"):
                if j != 0:
                    ax.yaxis.set_visible(False)
                if i != n - 1:
                    ax.xaxis.set_visible(False)
            elif plot_axes == "upper":
                if ylabel_direction == "left" and i != j:
                    ax.yaxis.set_visible(False)
                else:  # elif ylabel_direction == "right":
                    if j == n - 1:
                        ax.yaxis.tick_right()
                        ax.yaxis.set_label_position('right')
                    else:
                        ax.yaxis.set_visible(False)
                if i == 0:
                    ax.xaxis.tick_top()
                    ax.xaxis.set_label_position('top')
                else:
                    ax.xaxis.set_visible(False)

    if len(df.columns) > 1:
        lim1 = boundaries_list[0]
        locs = axes[0][1].yaxis.get_majorticklocs()
        locs = locs[(lim1[0] <= locs) & (locs <= lim1[1])]
        adj = (locs - lim1[0]) / (lim1[1] - lim1[0])

        lim0 = axes[0][0].get_ylim()
        adj = adj * (lim0[1] - lim0[0]) + lim0[0]
        axes[0][0].yaxis.set_ticks(adj)

        if np.all(locs == locs.astype(int)):
            # if all ticks are int
            locs = locs.astype(int)
        axes[0][0].yaxis.set_ticklabels(locs)

    _set_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0)

    return axes