def test_ci_to_errsize():
    """Check that ci_to_errsize turns interval limits into error bar sizes."""
    # First row holds the lower limits, second row the upper limits;
    # each column belongs to one of the two heights below.
    interval_limits = [[.5, .5],
                       [1.25, 1.5]]
    bar_heights = [1, 1.5]

    computed = utils.ci_to_errsize(interval_limits, bar_heights)

    # Distance from each height down to its lower limit (first row)
    # and up to its upper limit (second row).
    expected = np.array([[.5, 1],
                         [.25, 0]])
    assert_array_equal(expected, computed)
def lmplot(x, y, data, color=None, row=None, col=None,
           x_estimator=None, x_ci=95,
           fit_line=True, ci=95, truncate=False,
           sharex=True, sharey=True, palette="hls", size=None,
           scatter_kws=None, line_kws=None, palette_kws=None):
    """Plot a linear model from a DataFrame.

    Parameters
    ----------
    x, y : strings
        column names in `data` DataFrame for x and y variables
    data : DataFrame
        source of data for the model
    color : string, optional
        DataFrame column name to group the model by color
    row, col : strings, optional
        DataFrame column names to make separate plot facets
    x_estimator : callable, optional
        Interpret X values as factor labels and use this function
        to plot the point estimate and bootstrapped CI
    x_ci : int optional
        size of confidence interval for x_estimator error bars
    fit_line : bool, optional
        if True fit a regression line by color/row/col and plot
    ci : int, optional
        confidence interval for the regression line; if None, no
        confidence band is drawn
    truncate : bool, optional
        if True, only fit line from data min to data max
    sharex, sharey : bools, optional
        only relevant if faceting; passed to plt.subplots
    palette : seaborn color palette argument
        if using separate plots by color, draw with this color palette
    size : float, optional
        size (plots are square) for each plot facet
    {scatter, line}_kws : dictionary
        keyword arguments to pass to the underlying plot functions;
        the dictionaries are copied, so the caller's objects are
        left untouched
    palette_kws : dictionary
        keyword arguments for seaborn.color_palette

    """
    # TODO
    # - position_{dodge, jitter}
    # - legend when fit_line is False
    # - wrap title when wide
    # - wrap columns

    # First sort out the general figure layout
    if size is None:
        size = mpl.rcParams["figure.figsize"][1]

    nrow = 1 if row is None else len(data[row].unique())
    ncol = 1 if col is None else len(data[col].unique())

    f, axes = plt.subplots(nrow, ncol, sharex=sharex, sharey=sharey,
                           figsize=(size * ncol, size * nrow))
    axes = np.atleast_2d(axes).reshape(nrow, ncol)

    # The level values are needed for the facet titles whenever a facet
    # variable is given, even if that variable has only a single level
    if row is None:
        row_vals = None
        row_masks = [np.repeat(True, len(data))]
    else:
        row_vals = np.sort(data[row].unique())
        row_masks = [data[row] == val for val in row_vals]

    if col is None:
        col_vals = None
        col_masks = [np.repeat(True, len(data))]
    else:
        col_vals = np.sort(data[col].unique())
        col_masks = [data[col] == val for val in col_vals]

    if palette_kws is None:
        palette_kws = {}

    # Sort out the plot colors
    color_factor = color
    if color is None:
        hue_vals = None
        hue_masks = [np.repeat(True, len(data))]
        colors = ["#222222"]
    else:
        hue_vals = np.sort(data[color].unique())
        hue_masks = [data[color] == val for val in hue_vals]
        colors = color_palette(palette, len(hue_masks), **palette_kws)

    # Default keyword arguments for plot components; copy them so that
    # removing the marker options below does not alter the caller's dicts
    scatter_kws = {} if scatter_kws is None else dict(scatter_kws)
    line_kws = {} if line_kws is None else dict(line_kws)

    # Resolve the marker options once so that every subset gets the same
    # values (point estimates default to a larger marker than raw scatters)
    ms = scatter_kws.pop("ms", 4 if x_estimator is None else 7)
    mew = scatter_kws.pop("mew", 0)

    # First walk through the facets and plot the scatters
    for row_i, row_mask in enumerate(row_masks):
        for col_j, col_mask in enumerate(col_masks):
            ax = axes[row_i, col_j]
            if not sharex or (row_i + 1 == len(row_masks)):
                ax.set_xlabel(x)
            if not sharey or col_j == 0:
                ax.set_ylabel(y)

            # Title the plot if we are faceting
            title = ""
            if row is not None:
                title += "%s = %s" % (row, row_vals[row_i])
            if row is not None and col is not None:
                title += " | "
            if col is not None:
                title += "%s = %s" % (col, col_vals[col_j])
            ax.set_title(title)

            for hue_k, hue_mask in enumerate(hue_masks):
                hue_color = colors[hue_k]
                data_ijk = data[row_mask & col_mask & hue_mask]

                if x_estimator is not None:
                    # Treat each unique x value as a group and plot the
                    # estimate of y with a bootstrapped interval
                    x_vals = data_ijk[x].unique()
                    y_grouped = [np.array(data_ijk[y][data_ijk[x] == v])
                                 for v in x_vals]
                    y_est = [x_estimator(y_i) for y_i in y_grouped]
                    y_boots = [moss.bootstrap(np.array(y_i),
                                              func=x_estimator)
                               for y_i in y_grouped]
                    ci_lims = [50 - x_ci / 2., 50 + x_ci / 2.]
                    y_ci = [moss.percentiles(y_i, ci_lims)
                            for y_i in y_boots]
                    y_error = ci_to_errsize(np.transpose(y_ci), y_est)

                    ax.plot(x_vals, y_est, "o", mew=mew, ms=ms,
                            color=hue_color, **scatter_kws)
                    # "none" is the matplotlib spelling for "draw only
                    # the error bars, no connecting line or markers"
                    ax.errorbar(x_vals, y_est, y_error,
                                fmt="none", ecolor=hue_color)
                else:
                    ax.plot(data_ijk[x], data_ijk[y], "o",
                            color=hue_color, mew=mew, ms=ms, **scatter_kws)

    for ax_i in np.ravel(axes):
        ax_i.set_xmargin(.05)
        ax_i.autoscale_view()

    # Now walk through again and plot the regression estimate
    # and a confidence interval for the regression line
    if fit_line:
        for row_i, row_mask in enumerate(row_masks):
            for col_j, col_mask in enumerate(col_masks):
                ax = axes[row_i, col_j]
                xlim = ax.get_xlim()

                for hue_k, hue_mask in enumerate(hue_masks):
                    hue_color = colors[hue_k]
                    data_ijk = data[row_mask & col_mask & hue_mask]
                    x_vals = np.array(data_ijk[x])
                    y_vals = np.array(data_ijk[y])

                    # Nothing to fit for an empty facet/color combination
                    if not len(x_vals):
                        continue

                    # Sort out the limit of the fit
                    if truncate:
                        xx = np.linspace(x_vals.min(),
                                         x_vals.max(), 100)
                    else:
                        xx = np.linspace(xlim[0], xlim[1], 100)

                    # Inner function to bootstrap the regression
                    def _bootstrap_reg(x_fit, y_fit):
                        fit = np.polyfit(x_fit, y_fit, 1)
                        return np.polyval(fit, xx)

                    # Regression line confidence interval
                    if ci is not None:
                        ci_lims = [50 - ci / 2., 50 + ci / 2.]
                        boots = moss.bootstrap(x_vals, y_vals,
                                               func=_bootstrap_reg)
                        ci_band = moss.percentiles(boots, ci_lims, axis=0)
                        ax.fill_between(xx, *ci_band,
                                        color=hue_color, alpha=.15)

                    # Regression line on the full subset
                    reg = _bootstrap_reg(x_vals, y_vals)
                    if color_factor is None:
                        label = ""
                    else:
                        label = hue_vals[hue_k]
                    ax.plot(xx, reg, color=hue_color,
                            label=str(label), **line_kws)

                # Drawing the fit must not change the scatter's limits
                ax.set_xlim(xlim)

    # Plot the legend on the upper left facet and adjust the layout
    if color_factor is not None:
        axes[0, 0].legend(loc="best", title=color_factor)
    plt.tight_layout()
def _plot_ci_bars(ax, x, central_data, ci, color, **kwargs):
    """Plot error bars at each data point.

    Parameters
    ----------
    ax : matplotlib axes
        axes to draw the error bars onto
    x : sequence
        x positions of the data points
    central_data : sequence
        central y value at each position
    ci : sequence
        confidence interval limits, converted to bar sizes with
        ci_to_errsize
    color : matplotlib color
        color of the error bars
    kwargs : key, value mappings
        accepted for signature compatibility and ignored here

    """
    err = ci_to_errsize(ci, central_data)
    # fmt="none" draws only the error bars; passing None for this purpose
    # was deprecated in matplotlib 1.4
    ax.errorbar(x, central_data, yerr=err, fmt="none", ecolor=color)
def plot(self, ax, kws):
    """Draw one line per semantic subset onto `ax`, forwarding `kws`.

    `kws` is modified in place: the style entries are popped or
    overwritten while drawing.
    """
    # Probe the axes with an empty artist built from the passed kwargs.
    # That tells us which color/marker/width matplotlib itself would have
    # used (honoring the property cycle, which this call advances) so those
    # values can serve as fallbacks wherever the data semantics do not
    # dictate a style. There is no internal cycling: without ``hue`` every
    # element shares the same color.
    probe, = ax.plot([], [], **kws)

    orig_color = kws.pop("color", probe.get_color())
    orig_marker = kws.pop("marker", probe.get_marker())
    orig_linewidth = kws.pop("linewidth",
                             kws.pop("lw", probe.get_linewidth()))
    orig_dashes = kws.pop("dashes", "")

    kws.setdefault("markeredgewidth", kws.pop("mew", .75))
    kws.setdefault("markeredgecolor", kws.pop("mec", "w"))

    probe.remove()

    # Defaults for the error representation; reject unknown styles early
    err_kws = self.err_kws.copy()
    if self.err_style == "band":
        err_kws.setdefault("alpha", .2)
    elif self.err_style not in ("bars", None):
        err = "`err_style` must be 'band' or 'bars', not {}"
        raise ValueError(err.format(self.err_style))

    # One pass per combination of hue / size / style levels
    for (hue, size, style), data in self.subset_data():

        x, y, units = data["x"], data["y"], data.get("units", None)

        y_ci = None
        if self.estimator is not None:
            if self.units is not None:
                err = "estimator must be None when specifying units"
                raise ValueError(err)
            x, y, y_ci = self.aggregate(y, x, units)

        # Resolve the style of this subset, falling back to the probed
        # defaults when a level has no dedicated mapping
        kws.update(
            color=self.palette.get(hue, orig_color),
            dashes=self.dashes.get(style, orig_dashes),
            marker=self.markers.get(style, orig_marker),
            linewidth=self.sizes.get(size, orig_linewidth),
        )

        # Throwaway artist used to read back the resolved line properties
        sample, = ax.plot([], [], **kws)
        line_color = sample.get_color()
        line_alpha = sample.get_alpha()
        line_capstyle = sample.get_solid_capstyle()
        sample.remove()

        # --- Draw the main line

        x, y = np.asarray(x), np.asarray(y)

        if self.units is not None:
            # A separate line for every sampling unit
            for unit in units.unique():
                rows = np.asarray(units == unit)
                ax.plot(x[rows], y[rows], **kws)
        else:
            line, = ax.plot(x, y, **kws)

        # --- Draw the confidence intervals

        if y_ci is None:
            continue

        low, high = np.asarray(y_ci["low"]), np.asarray(y_ci["high"])

        if self.err_style == "band":

            ax.fill_between(x, low, high, color=line_color, **err_kws)

        elif self.err_style == "bars":

            y_err = ci_to_errsize((low, high), y)
            ebars = ax.errorbar(x, y, y_err, linestyle="",
                                color=line_color, alpha=line_alpha,
                                **err_kws)

            # Give the error bars the same cap style as the line
            for child in ebars.get_children():
                try:
                    child.set_capstyle(line_capstyle)
                except AttributeError:
                    # Does not exist on mpl < 2.2
                    pass

    # Finalize the axes details
    self.label_axes(ax)
    if self.legend:
        self.add_legend_data(ax)
        handles, _ = ax.get_legend_handles_labels()
        if handles:
            ax.legend()
def lmplot(x, y, data, color=None, row=None, col=None, col_wrap=None,
           x_estimator=None, x_ci=95, n_boot=5000, fit_reg=True,
           order=1, ci=95, logistic=False, truncate=False,
           x_partial=None, y_partial=None, x_jitter=None, y_jitter=None,
           sharex=True, sharey=True, palette="husl", size=None,
           scatter_kws=None, line_kws=None, palette_kws=None):
    """Plot a linear model from a DataFrame.

    Parameters
    ----------
    x, y : strings
        column names in `data` DataFrame for x and y variables
    data : DataFrame
        source of data for the model
    color : string, optional
        DataFrame column name to group the model by color
    row, col : strings, optional
        DataFrame column names to make separate plot facets
    col_wrap : int, optional
        wrap col variable at this width - cannot be used with row facet
    x_estimator : callable, optional
        Interpret X values as factor labels and use this function
        to plot the point estimate and bootstrapped CI
    x_ci : int optional
        size of confidence interval for x_estimator error bars
    n_boot : int, optional
        number of bootstrap iterations to perform
    fit_reg : bool, optional
        if True fit a regression model by color/row/col and plot
    order : int, optional
        order of the regression polynomial to fit (default = 1)
    ci : int, optional
        confidence interval for the regression line; if None, no
        confidence band is drawn
    logistic : bool, optional
        fit the regression line with logistic regression
    truncate : bool, optional
        if True, only fit line from data min to data max
    {x, y}_partial : string or list of strings, optional
        regress these variables out of the factors before plotting
    {x, y}_jitter : float, optional
        parameters for uniformly distributed random noise added to positions
    sharex, sharey : bools, optional
        only relevant if faceting; passed to plt.subplots
    palette : seaborn color palette argument
        if using separate plots by color, draw with this color palette
    size : float, optional
        size (plots are square) for each plot facet
    {scatter, line}_kws : dictionary
        keyword arguments to pass to the underlying plot functions;
        the dictionaries are copied, so the caller's objects are
        left untouched
    palette_kws : dictionary
        keyword arguments for seaborn.color_palette

    Raises
    ------
    ValueError
        if `col_wrap` is given without `col`, or together with `row`

    """
    # TODO
    # - legend when fit_line is False
    # - wrap title when wide

    # First sort out the general figure layout
    if size is None:
        size = mpl.rcParams["figure.figsize"][1]

    if col is None and col_wrap is not None:
        raise ValueError("Need column facet variable for `col_wrap`")
    if row is not None and col_wrap is not None:
        raise ValueError("Cannot facet rows when using `col_wrap`")

    nrow = 1 if row is None else len(data[row].unique())
    ncol = 1 if col is None else len(data[col].unique())

    if col_wrap is not None:
        ncol = col_wrap
        # Force true division so a partially filled last row is counted
        nrow = int(np.ceil(len(data[col].unique()) / float(col_wrap)))

    f, axes = plt.subplots(nrow, ncol, sharex=sharex, sharey=sharey,
                           figsize=(size * ncol, size * nrow))
    axes = np.atleast_2d(axes).reshape(nrow, ncol)

    # Build the masks from the facet variables themselves rather than from
    # the grid shape: the level values are needed for titling even when a
    # variable has one level, and with `col_wrap` the number of grid
    # columns says nothing about the number of column levels
    if row is None:
        row_vals = None
        row_masks = [np.repeat(True, len(data))]
    else:
        row_vals = np.sort(data[row].unique())
        row_masks = [data[row] == val for val in row_vals]

    if col is None:
        col_vals = None
        col_masks = [np.repeat(True, len(data))]
    else:
        col_vals = np.sort(data[col].unique())
        col_masks = [data[col] == val for val in col_vals]

    if x_partial is not None:
        if not isinstance(x_partial, list):
            x_partial = [x_partial]
    if y_partial is not None:
        if not isinstance(y_partial, list):
            y_partial = [y_partial]

    if palette_kws is None:
        palette_kws = {}

    # Sort out the plot colors
    color_factor = color
    if color is None:
        hue_vals = None
        hue_masks = [np.repeat(True, len(data))]
        colors = ["#222222"]
    else:
        hue_vals = np.sort(data[color].unique())
        hue_masks = [data[color] == val for val in hue_vals]
        colors = color_palette(palette, len(hue_masks), **palette_kws)

    # Default keyword arguments for plot components; copy them so that
    # removing the marker options below does not alter the caller's dicts
    scatter_kws = {} if scatter_kws is None else dict(scatter_kws)
    line_kws = {} if line_kws is None else dict(line_kws)

    # Resolve the marker options once. A user supplied marker size applies
    # to both kinds of plot; otherwise point estimates are drawn larger
    # than raw scatter points. The alpha is only applied to raw scatters.
    user_ms = scatter_kws.pop("ms", None)
    scatter_ms = 4 if user_ms is None else user_ms
    estimate_ms = 7 if user_ms is None else user_ms
    scatter_mew = scatter_kws.pop("mew", 0)
    scatter_alpha = scatter_kws.pop("alpha", .77)

    def _facet_index(row_i, col_j):
        """Return the grid position of a facet, honoring `col_wrap`."""
        if col_wrap is not None:
            return col_j // ncol, col_j % ncol
        return row_i, col_j

    def _partial_out(values, frame, nuisance):
        """Remove each nuisance column of `frame` from `values`."""
        for var in nuisance:
            conf = frame[var]
            conf = conf - conf.mean()
            center = values.mean()
            values = moss.vector_reject(values - center, conf)
            values = values + center
        return values

    # First walk through the facets and plot the scatters
    for row_i, row_mask in enumerate(row_masks):
        for col_j, col_mask in enumerate(col_masks):
            f_row, f_col = _facet_index(row_i, col_j)
            ax = axes[f_row, f_col]

            if f_row + 1 == nrow:
                ax.set_xlabel(x)
            if f_col == 0:
                ax.set_ylabel(y)

            # Title the plot if we are faceting
            title = ""
            if row is not None:
                title += "%s = %s" % (row, row_vals[row_i])
            if row is not None and col is not None:
                title += " | "
            if col is not None:
                title += "%s = %s" % (col, col_vals[col_j])
            ax.set_title(title)

            for hue_k, hue_mask in enumerate(hue_masks):
                hue_color = colors[hue_k]
                data_ijk = data[row_mask & col_mask & hue_mask]

                if x_estimator is not None:
                    # Treat each unique x value as a group and plot the
                    # estimate of y with a bootstrapped interval
                    x_vals = data_ijk[x].unique()
                    y_vals = data_ijk[y]

                    if y_partial is not None:
                        y_vals = _partial_out(y_vals, data_ijk, y_partial)

                    y_grouped = [np.array(y_vals[data_ijk[x] == v])
                                 for v in x_vals]
                    y_est = [x_estimator(y_i) for y_i in y_grouped]
                    y_boots = [moss.bootstrap(np.array(y_i),
                                              func=x_estimator,
                                              n_boot=n_boot)
                               for y_i in y_grouped]
                    ci_lims = [50 - x_ci / 2., 50 + x_ci / 2.]
                    y_ci = [moss.percentiles(y_i, ci_lims)
                            for y_i in y_boots]
                    y_error = ci_to_errsize(np.transpose(y_ci), y_est)

                    ax.plot(x_vals, y_est, "o", mew=scatter_mew,
                            ms=estimate_ms, color=hue_color, **scatter_kws)
                    # "none" is the matplotlib spelling for "draw only
                    # the error bars, no connecting line or markers"
                    ax.errorbar(x_vals, y_est, y_error,
                                fmt="none", ecolor=hue_color)
                else:
                    x_ = data_ijk[x]
                    y_ = data_ijk[y]

                    if x_partial is not None:
                        x_ = _partial_out(x_, data_ijk, x_partial)
                    if y_partial is not None:
                        y_ = _partial_out(y_, data_ijk, y_partial)

                    # Add the noise out of place so that integer valued
                    # data can be jittered and the frame is not modified
                    if x_jitter is not None:
                        x_ = x_ + np.random.uniform(-x_jitter, x_jitter,
                                                    x_.shape)
                    if y_jitter is not None:
                        y_ = y_ + np.random.uniform(-y_jitter, y_jitter,
                                                    y_.shape)

                    ax.plot(x_, y_, "o", color=hue_color,
                            alpha=scatter_alpha, mew=scatter_mew,
                            ms=scatter_ms, **scatter_kws)

    for ax_i in np.ravel(axes):
        ax_i.set_xmargin(.05)
        ax_i.autoscale_view()

    # Now walk through again and plot the regression estimate
    # and a confidence interval for the regression line
    if fit_reg:
        for row_i, row_mask in enumerate(row_masks):
            for col_j, col_mask in enumerate(col_masks):
                f_row, f_col = _facet_index(row_i, col_j)
                ax = axes[f_row, f_col]
                xlim = ax.get_xlim()

                for hue_k, hue_mask in enumerate(hue_masks):
                    hue_color = colors[hue_k]
                    data_ijk = data[row_mask & col_mask & hue_mask]
                    x_vals = np.array(data_ijk[x])
                    y_vals = np.array(data_ijk[y])

                    # Nothing to fit for an empty facet/color combination
                    if not len(x_vals):
                        continue

                    # Sort out the limit of the fit
                    if truncate:
                        xx = np.linspace(x_vals.min(),
                                         x_vals.max(), 100)
                    else:
                        xx = np.linspace(xlim[0], xlim[1], 100)
                    xx_ = sm.add_constant(xx, prepend=True)

                    # Inner function to bootstrap the regression
                    def _regress(x_fit, y_fit):
                        if logistic:
                            design = sm.add_constant(x_fit, prepend=True)
                            fit = sm.GLM(y_fit, design,
                                         family=sm.families.Binomial()
                                         ).fit()
                            reg = fit.predict(xx_)
                        else:
                            fit = np.polyfit(x_fit, y_fit, order)
                            reg = np.polyval(fit, xx)
                        return reg

                    # Remove nuisance variables with vector rejection
                    if x_partial is not None:
                        x_vals = _partial_out(x_vals, data_ijk, x_partial)
                    if y_partial is not None:
                        y_vals = _partial_out(y_vals, data_ijk, y_partial)

                    # Regression line confidence interval
                    if ci is not None:
                        ci_lims = [50 - ci / 2., 50 + ci / 2.]
                        boots = moss.bootstrap(x_vals, y_vals,
                                               func=_regress,
                                               n_boot=n_boot)
                        ci_band = moss.percentiles(boots, ci_lims, axis=0)
                        ax.fill_between(xx, *ci_band,
                                        color=hue_color, alpha=.15)

                    # Regression line
                    reg = _regress(x_vals, y_vals)
                    if color_factor is None:
                        label = ""
                    else:
                        label = hue_vals[hue_k]
                    ax.plot(xx, reg, color=hue_color,
                            label=str(label), **line_kws)

                # Drawing the fit must not change the scatter's limits
                ax.set_xlim(xlim)

    # Plot the legend on the upper left facet and adjust the layout
    if color_factor is not None and color_factor not in [row, col]:
        axes[0, 0].legend(loc="best", title=color_factor)
    plt.tight_layout()
def lmplot(x, y, data, color=None, row=None, col=None, col_wrap=None,
           x_estimator=None, x_ci=95, x_bins=None, n_boot=5000,
           fit_reg=True, order=1, ci=95, logistic=False, truncate=False,
           x_partial=None, y_partial=None, x_jitter=None, y_jitter=None,
           sharex=True, sharey=True, palette="husl", size=None,
           scatter_kws=None, line_kws=None, palette_kws=None):
    """Plot a linear model with faceting, color binning, and other options.

    Parameters
    ----------
    x, y : strings
        Column names in `data` DataFrame for x and y variables.
    data : DataFrame
        Source of data for the model.
    color : string, optional
        DataFrame column name to group the model by color.
    row, col : strings, optional
        DataFrame column names to make separate plot facets.
    col_wrap : int, optional
        Wrap col variable at this width - cannot be used with row facet.
    x_estimator : callable, optional
        Interpret X values as factor labels and use this function
        to plot the point estimate and bootstrapped CI.
    x_ci : int optional
        Size of confidence interval for x_estimator error bars.
    x_bins : sequence of floats, optional
        Bin the x variable with these values. Implies that x_estimator is
        mean, unless otherwise provided. Each observation is assigned to
        the nearest bin value; bins without observations are not drawn.
    n_boot : int, optional
        Number of bootstrap iterations to perform.
    fit_reg : bool, optional
        If True fit a regression model by color/row/col and plot.
    order : int, optional
        Order of the regression polynomial to fit.
    ci : int, optional
        Confidence interval for the regression line. If None, no
        confidence band is drawn.
    logistic : bool, optional
        Fit the regression line with logistic regression.
    truncate : bool, optional
        If True, only fit line from data min to data max.
    {x, y}_partial : string or list of strings, optional
        Regress these variables out of the factors before plotting.
    {x, y}_jitter : float, optional
        Parameters for uniformly distributed random noise added to positions.
    sharex, sharey : bools, optional
        Only relevant if faceting; passed to plt.subplots.
    palette : seaborn color palette argument
        If using separate plots by color, draw with this color palette.
    size : float, optional
        Size (plots are square) for each plot facet.
    {scatter, line}_kws : dictionary
        Keyword arguments to pass to the underlying plot functions. The
        dictionaries are copied, so the caller's objects are left untouched.
    palette_kws : dictionary
        Keyword arguments for seaborn.color_palette.

    Raises
    ------
    ValueError
        If `col_wrap` is given without `col`, or together with `row`.

    """
    # TODO
    # - legend when fit_line is False

    # First sort out the general figure layout
    if size is None:
        size = mpl.rcParams["figure.figsize"][1]

    if col is None and col_wrap is not None:
        raise ValueError("Need column facet variable for `col_wrap`")
    if row is not None and col_wrap is not None:
        raise ValueError("Cannot facet rows when using `col_wrap`")

    nrow = 1 if row is None else len(data[row].unique())
    ncol = 1 if col is None else len(data[col].unique())

    if col_wrap is not None:
        ncol = col_wrap
        # Force true division so a partially filled last row is counted
        nrow = int(np.ceil(len(data[col].unique()) / float(col_wrap)))

    f, axes = plt.subplots(nrow, ncol, sharex=sharex, sharey=sharey,
                           figsize=(size * ncol, size * nrow))
    axes = np.atleast_2d(axes).reshape(nrow, ncol)

    # Build the masks from the facet variables themselves rather than from
    # the grid shape: the level values are needed for titling even when a
    # variable has one level, and with `col_wrap` the number of grid
    # columns says nothing about the number of column levels
    if row is None:
        row_vals = None
        row_masks = [np.repeat(True, len(data))]
    else:
        row_vals = np.sort(data[row].unique())
        row_masks = [data[row] == val for val in row_vals]

    if col is None:
        col_vals = None
        col_masks = [np.repeat(True, len(data))]
    else:
        col_vals = np.sort(data[col].unique())
        col_masks = [data[col] == val for val in col_vals]

    if x_bins is not None:
        x_estimator = np.mean if x_estimator is None else x_estimator
        # Column vector, as needed for the distance computation below
        x_bins = np.c_[x_bins]

    if x_partial is not None:
        if not isinstance(x_partial, list):
            x_partial = [x_partial]
    if y_partial is not None:
        if not isinstance(y_partial, list):
            y_partial = [y_partial]

    if palette_kws is None:
        palette_kws = {}

    # Sort out the plot colors
    color_factor = color
    if color is None:
        hue_vals = None
        hue_masks = [np.repeat(True, len(data))]
        colors = ["#222222"]
    else:
        hue_vals = np.sort(data[color].unique())
        hue_masks = [data[color] == val for val in hue_vals]
        colors = color_palette(palette, len(hue_masks), **palette_kws)

    # Default keyword arguments for plot components; copy them so that
    # removing the marker options below does not alter the caller's dicts
    scatter_kws = {} if scatter_kws is None else dict(scatter_kws)
    line_kws = {} if line_kws is None else dict(line_kws)

    # Resolve the marker options once. A user supplied marker size applies
    # to both kinds of plot; otherwise point estimates are drawn larger
    # than raw scatter points. The alpha is only applied to raw scatters.
    user_ms = scatter_kws.pop("ms", None)
    scatter_ms = 4 if user_ms is None else user_ms
    estimate_ms = 7 if user_ms is None else user_ms
    scatter_mew = scatter_kws.pop("mew", 0)
    scatter_alpha = scatter_kws.pop("alpha", .77)

    def _facet_index(row_i, col_j):
        """Return the grid position of a facet, honoring `col_wrap`."""
        if col_wrap is not None:
            return col_j // ncol, col_j % ncol
        return row_i, col_j

    def _partial_out(values, frame, nuisance):
        """Remove each nuisance column of `frame` from `values`."""
        for var in nuisance:
            conf = frame[var]
            conf = conf - conf.mean()
            center = values.mean()
            values = moss.vector_reject(values - center, conf)
            values = values + center
        return values

    # First walk through the facets and plot the scatters
    for row_i, row_mask in enumerate(row_masks):
        for col_j, col_mask in enumerate(col_masks):
            f_row, f_col = _facet_index(row_i, col_j)
            ax = axes[f_row, f_col]

            if f_row + 1 == nrow:
                ax.set_xlabel(x)
            if f_col == 0:
                ax.set_ylabel(y)

            # Title the plot if we are faceting
            title = ""
            if row is not None:
                title += "%s = %s" % (row, row_vals[row_i])
            if row is not None and col is not None:
                title += " | "
            if col is not None:
                title += "%s = %s" % (col, col_vals[col_j])
            # Small facets get a two line title instead of one wide line
            if size < 3:
                title = title.replace(" | ", "\n")
            ax.set_title(title)

            for hue_k, hue_mask in enumerate(hue_masks):
                hue_color = colors[hue_k]
                data_ijk = data[row_mask & col_mask & hue_mask]

                if x_estimator is not None:
                    if x_bins is None:
                        # Every unique x value is its own group
                        x_vals = data_ijk[x].unique()
                        x_data = data_ijk[x]
                    else:
                        # Assign every observation to its nearest bin
                        dist = distance.cdist(np.c_[data_ijk[x]], x_bins)
                        x_data = x_bins[np.argmin(dist, axis=1)].ravel()
                        # Only keep bins that received observations; an
                        # empty bin has no estimate that could be drawn
                        x_vals = np.array([v for v in x_bins.ravel()
                                           if (x_data == v).any()])

                    y_vals = data_ijk[y]
                    if y_partial is not None:
                        y_vals = _partial_out(y_vals, data_ijk, y_partial)

                    y_grouped = [np.array(y_vals[x_data == v])
                                 for v in x_vals]
                    y_est = [x_estimator(y_i) for y_i in y_grouped]
                    y_boots = [moss.bootstrap(np.array(y_i),
                                              func=x_estimator,
                                              n_boot=n_boot)
                               for y_i in y_grouped]
                    ci_lims = [50 - x_ci / 2., 50 + x_ci / 2.]
                    y_ci = [moss.percentiles(y_i, ci_lims)
                            for y_i in y_boots]
                    y_error = ci_to_errsize(np.transpose(y_ci), y_est)

                    ax.plot(x_vals, y_est, "o", mew=scatter_mew,
                            ms=estimate_ms, color=hue_color, **scatter_kws)
                    # "none" is the matplotlib spelling for "draw only
                    # the error bars, no connecting line or markers"
                    ax.errorbar(x_vals, y_est, y_error,
                                fmt="none", ecolor=hue_color)
                else:
                    x_ = data_ijk[x]
                    y_ = data_ijk[y]

                    if x_partial is not None:
                        x_ = _partial_out(x_, data_ijk, x_partial)
                    if y_partial is not None:
                        y_ = _partial_out(y_, data_ijk, y_partial)

                    # Add the noise out of place so that integer valued
                    # data can be jittered and the frame is not modified
                    if x_jitter is not None:
                        x_ = x_ + np.random.uniform(-x_jitter, x_jitter,
                                                    x_.shape)
                    if y_jitter is not None:
                        y_ = y_ + np.random.uniform(-y_jitter, y_jitter,
                                                    y_.shape)

                    ax.plot(x_, y_, "o", color=hue_color,
                            alpha=scatter_alpha, mew=scatter_mew,
                            ms=scatter_ms, **scatter_kws)

    for ax_i in np.ravel(axes):
        ax_i.set_xmargin(.05)
        ax_i.autoscale_view()

    # Now walk through again and plot the regression estimate
    # and a confidence interval for the regression line
    if fit_reg:
        for row_i, row_mask in enumerate(row_masks):
            for col_j, col_mask in enumerate(col_masks):
                f_row, f_col = _facet_index(row_i, col_j)
                ax = axes[f_row, f_col]
                xlim = ax.get_xlim()

                for hue_k, hue_mask in enumerate(hue_masks):
                    hue_color = colors[hue_k]
                    data_ijk = data[row_mask & col_mask & hue_mask]
                    x_vals = np.array(data_ijk[x])
                    y_vals = np.array(data_ijk[y])

                    # Nothing to fit for an empty facet/color combination
                    if not len(x_vals):
                        continue

                    # Sort out the limit of the fit
                    if truncate:
                        xx = np.linspace(x_vals.min(),
                                         x_vals.max(), 100)
                    else:
                        xx = np.linspace(xlim[0], xlim[1], 100)
                    xx_ = sm.add_constant(xx, prepend=True)

                    # Inner function to bootstrap the regression
                    def _regress(x_fit, y_fit):
                        if logistic:
                            design = sm.add_constant(x_fit, prepend=True)
                            fit = sm.GLM(y_fit, design,
                                         family=sm.families.Binomial()
                                         ).fit()
                            reg = fit.predict(xx_)
                        else:
                            fit = np.polyfit(x_fit, y_fit, order)
                            reg = np.polyval(fit, xx)
                        return reg

                    # Remove nuisance variables with vector rejection
                    if x_partial is not None:
                        x_vals = _partial_out(x_vals, data_ijk, x_partial)
                    if y_partial is not None:
                        y_vals = _partial_out(y_vals, data_ijk, y_partial)

                    # Regression line confidence interval
                    if ci is not None:
                        ci_lims = [50 - ci / 2., 50 + ci / 2.]
                        boots = moss.bootstrap(x_vals, y_vals,
                                               func=_regress,
                                               n_boot=n_boot)
                        ci_band = moss.percentiles(boots, ci_lims, axis=0)
                        ax.fill_between(xx, *ci_band,
                                        color=hue_color, alpha=.15)

                    # Regression line
                    reg = _regress(x_vals, y_vals)
                    if color_factor is None:
                        label = ""
                    else:
                        label = hue_vals[hue_k]
                    ax.plot(xx, reg, color=hue_color,
                            label=str(label), **line_kws)

                # Drawing the fit must not change the scatter's limits
                ax.set_xlim(xlim)

    # Plot the legend on the upper left facet and adjust the layout
    if color_factor is not None and color_factor not in [row, col]:
        axes[0, 0].legend(loc="best", title=color_factor)
    plt.tight_layout()
def _plot_ci_bars(ax, x, central_data, ci, color, err_kws, **kwargs):
    """Plot error bars at each data point.

    Parameters
    ----------
    ax : matplotlib axes
        axes to draw the error bars onto
    x : sequence
        x positions of the data points
    central_data : sequence
        central y value at each position
    ci : sequence
        confidence interval limits, converted to bar sizes with
        ci_to_errsize
    color : matplotlib color
        color of the error bars
    err_kws : dictionary
        additional keyword arguments passed on to ax.errorbar
    kwargs : key, value mappings
        accepted for signature compatibility and ignored here

    """
    err = ci_to_errsize(ci, central_data)
    # fmt="none" draws only the error bars; passing None for this purpose
    # was deprecated in matplotlib 1.4. The "_nolegend_" label keeps the
    # bars out of the legend.
    ax.errorbar(x, central_data, yerr=err, fmt="none", ecolor=color,
                label="_nolegend_", **err_kws)
def lmplot(x, y, data, color=None, row=None, col=None,
           x_estimator=None, x_ci=95,
           fit_line=True, ci=95, truncate=False,
           sharex=True, sharey=True, palette="hls", size=None,
           scatter_kws=None, line_kws=None, palette_kws=None):
    """Plot a linear model from a DataFrame.

    Parameters
    ----------
    x, y : strings
        column names in `data` DataFrame for x and y variables
    data : DataFrame
        source of data for the model
    color : string, optional
        DataFrame column name to group the model by color
    row, col : strings, optional
        DataFrame column names to make separate plot facets
    x_estimator : callable, optional
        Interpret X values as factor labels and use this function
        to plot the point estimate and bootstrapped CI
    x_ci : int optional
        size of confidence interval for x_estimator error bars
    fit_line : bool, optional
        if True fit a regression line by color/row/col and plot
    ci : int, optional
        confidence interval for the regression line; if None, no
        confidence band is drawn
    truncate : bool, optional
        if True, only fit line from data min to data max
    sharex, sharey : bools, optional
        only relevant if faceting; passed to plt.subplots
    palette : seaborn color palette argument
        if using separate plots by color, draw with this color palette
    size : float, optional
        size (plots are square) for each plot facet
    {scatter, line}_kws : dictionary
        keyword arguments to pass to the underlying plot functions;
        the dictionaries are copied, so the caller's objects are
        left untouched
    palette_kws : dictionary
        keyword arguments for seaborn.color_palette

    """
    # TODO
    # - position_{dodge, jitter}
    # - legend when fit_line is False
    # - wrap title when wide
    # - wrap columns

    # First sort out the general figure layout
    if size is None:
        size = mpl.rcParams["figure.figsize"][1]

    nrow = 1 if row is None else len(data[row].unique())
    ncol = 1 if col is None else len(data[col].unique())

    f, axes = plt.subplots(nrow, ncol, sharex=sharex, sharey=sharey,
                           figsize=(size * ncol, size * nrow))
    axes = np.atleast_2d(axes).reshape(nrow, ncol)

    # The level values are needed for the facet titles whenever a facet
    # variable is given, even if that variable has only a single level
    if row is None:
        row_vals = None
        row_masks = [np.repeat(True, len(data))]
    else:
        row_vals = np.sort(data[row].unique())
        row_masks = [data[row] == val for val in row_vals]

    if col is None:
        col_vals = None
        col_masks = [np.repeat(True, len(data))]
    else:
        col_vals = np.sort(data[col].unique())
        col_masks = [data[col] == val for val in col_vals]

    if palette_kws is None:
        palette_kws = {}

    # Sort out the plot colors
    color_factor = color
    if color is None:
        hue_vals = None
        hue_masks = [np.repeat(True, len(data))]
        colors = ["#222222"]
    else:
        hue_vals = np.sort(data[color].unique())
        hue_masks = [data[color] == val for val in hue_vals]
        colors = color_palette(palette, len(hue_masks), **palette_kws)

    # Default keyword arguments for plot components; copy them so that
    # removing the marker options below does not alter the caller's dicts
    scatter_kws = {} if scatter_kws is None else dict(scatter_kws)
    line_kws = {} if line_kws is None else dict(line_kws)

    # Resolve the marker options once so that every subset gets the same
    # values (point estimates default to a larger marker than raw scatters)
    ms = scatter_kws.pop("ms", 4 if x_estimator is None else 7)
    mew = scatter_kws.pop("mew", 0)

    # First walk through the facets and plot the scatters
    for row_i, row_mask in enumerate(row_masks):
        for col_j, col_mask in enumerate(col_masks):
            ax = axes[row_i, col_j]
            if not sharex or (row_i + 1 == len(row_masks)):
                ax.set_xlabel(x)
            if not sharey or col_j == 0:
                ax.set_ylabel(y)

            # Title the plot if we are faceting
            title = ""
            if row is not None:
                title += "%s = %s" % (row, row_vals[row_i])
            if row is not None and col is not None:
                title += " | "
            if col is not None:
                title += "%s = %s" % (col, col_vals[col_j])
            ax.set_title(title)

            for hue_k, hue_mask in enumerate(hue_masks):
                hue_color = colors[hue_k]
                data_ijk = data[row_mask & col_mask & hue_mask]

                if x_estimator is not None:
                    # Treat each unique x value as a group and plot the
                    # estimate of y with a bootstrapped interval
                    x_vals = data_ijk[x].unique()
                    y_grouped = [np.array(data_ijk[y][data_ijk[x] == v])
                                 for v in x_vals]
                    y_est = [x_estimator(y_i) for y_i in y_grouped]
                    y_boots = [moss.bootstrap(np.array(y_i),
                                              func=x_estimator)
                               for y_i in y_grouped]
                    ci_lims = [50 - x_ci / 2., 50 + x_ci / 2.]
                    y_ci = [moss.percentiles(y_i, ci_lims)
                            for y_i in y_boots]
                    y_error = ci_to_errsize(np.transpose(y_ci), y_est)

                    ax.plot(x_vals, y_est, "o", mew=mew, ms=ms,
                            color=hue_color, **scatter_kws)
                    # "none" is the matplotlib spelling for "draw only
                    # the error bars, no connecting line or markers"
                    ax.errorbar(x_vals, y_est, y_error,
                                fmt="none", ecolor=hue_color)
                else:
                    ax.plot(data_ijk[x], data_ijk[y], "o",
                            color=hue_color, mew=mew, ms=ms, **scatter_kws)

    for ax_i in np.ravel(axes):
        ax_i.set_xmargin(.05)
        ax_i.autoscale_view()

    # Now walk through again and plot the regression estimate
    # and a confidence interval for the regression line
    if fit_line:
        for row_i, row_mask in enumerate(row_masks):
            for col_j, col_mask in enumerate(col_masks):
                ax = axes[row_i, col_j]
                xlim = ax.get_xlim()

                for hue_k, hue_mask in enumerate(hue_masks):
                    hue_color = colors[hue_k]
                    data_ijk = data[row_mask & col_mask & hue_mask]
                    x_vals = np.array(data_ijk[x])
                    y_vals = np.array(data_ijk[y])

                    # Nothing to fit for an empty facet/color combination
                    if not len(x_vals):
                        continue

                    # Sort out the limit of the fit
                    if truncate:
                        xx = np.linspace(x_vals.min(),
                                         x_vals.max(), 100)
                    else:
                        xx = np.linspace(xlim[0], xlim[1], 100)

                    # Inner function to bootstrap the regression
                    def _bootstrap_reg(x_fit, y_fit):
                        fit = np.polyfit(x_fit, y_fit, 1)
                        return np.polyval(fit, xx)

                    # Regression line confidence interval
                    if ci is not None:
                        ci_lims = [50 - ci / 2., 50 + ci / 2.]
                        boots = moss.bootstrap(x_vals, y_vals,
                                               func=_bootstrap_reg)
                        ci_band = moss.percentiles(boots, ci_lims, axis=0)
                        ax.fill_between(xx, *ci_band,
                                        color=hue_color, alpha=.15)

                    # Regression line on the full subset
                    reg = _bootstrap_reg(x_vals, y_vals)
                    if color_factor is None:
                        label = ""
                    else:
                        label = hue_vals[hue_k]
                    ax.plot(xx, reg, color=hue_color,
                            label=str(label), **line_kws)

                # Drawing the fit must not change the scatter's limits
                ax.set_xlim(xlim)

    # Plot the legend on the upper left facet and adjust the layout
    if color_factor is not None:
        axes[0, 0].legend(loc="best", title=color_factor)
    plt.tight_layout()