Beispiel #1
0
def test_ci_to_errsize():
    """Check ci_to_errsize against hand-computed error bar sizes."""
    # Two intervals, given as one row of limits per side, and the central
    # value ("height") that each interval surrounds.
    interval_limits = [[.5, .5], [1.25, 1.5]]
    central_values = [1, 1.5]

    # Each expected entry is the absolute gap between a central value and
    # the matching limit, e.g. 1 - .5 = .5 and 1.5 - 1.5 = 0.
    expected = np.array([[.5, 1], [.25, 0]])

    observed = utils.ci_to_errsize(interval_limits, central_values)
    assert_array_equal(expected, observed)
Beispiel #2
0
def lmplot(x,
           y,
           data,
           color=None,
           row=None,
           col=None,
           x_estimator=None,
           x_ci=95,
           fit_line=True,
           ci=95,
           truncate=False,
           sharex=True,
           sharey=True,
           palette="hls",
           size=None,
           scatter_kws=None,
           line_kws=None,
           palette_kws=None):
    """Plot a linear model from a DataFrame.

    Parameters
    ----------
    x, y : strings
        column names in `data` DataFrame for x and y variables
    data : DataFrame
        source of data for the model
    color : string, optional
        DataFrame column name to group the model by color
    row, col : strings, optional
        DataFrame column names to make separate plot facets
    x_estimator : callable, optional
        Interpret X values as factor labels and use this function
        to plot the point estimate and bootstrapped CI
    x_ci : int optional
        size of confidence interval for x_estimator error bars
    fit_line : bool, optional
        if True fit a regression line by color/row/col and plot
    ci : int, optional
        confidence interval for the regression line; None skips the band
    truncate : bool, optional
        if True, only fit line from data min to data max
    sharex, sharey : bools, optional
        only relevant if faceting; passed to plt.subplots
    palette : seaborn color palette argument
        if using separate plots by color, draw with this color palette
    size : float, optional
        size (plots are square) for each plot facet
    {scatter, line}_kws : dictionary
        keyword arguments to pass to the underlying plot functions;
        "ms" and "mew" in scatter_kws override the default marker size
        and edge width for every facet and color group, and the caller's
        dictionary is left unmodified
    palette_kws : dictionary
        keyword arguments for seaborn.color_palette

    Returns
    -------
    None
        the figure is created with plt.subplots and drawn on in place

    """
    # TODO
    # - position_{dodge, jitter}
    # - legend when fit_line is False
    # - wrap title when wide
    # - wrap columns

    # First sort out the general figure layout
    if size is None:
        size = mpl.rcParams["figure.figsize"][1]

    # Mask that selects every observation; used for any unused facet/hue
    no_mask = np.repeat(True, len(data))

    # Decide on faceting from whether a variable was requested rather than
    # from its number of levels, so that a facet variable with a single
    # level still has its values available when the titles are built.
    if row is None:
        row_vals, row_masks = [None], [no_mask]
    else:
        row_vals = np.sort(data[row].unique())
        row_masks = [data[row] == val for val in row_vals]

    if col is None:
        col_vals, col_masks = [None], [no_mask]
    else:
        col_vals = np.sort(data[col].unique())
        col_masks = [data[col] == val for val in col_vals]

    nrow, ncol = len(row_masks), len(col_masks)

    f, axes = plt.subplots(nrow,
                           ncol,
                           sharex=sharex,
                           sharey=sharey,
                           figsize=(size * ncol, size * nrow))
    axes = np.atleast_2d(axes).reshape(nrow, ncol)

    if palette_kws is None:
        palette_kws = {}

    # Sort out the plot colors
    if color is None:
        hue_vals = [""]
        hue_masks = [no_mask]
        colors = ["#222222"]
    else:
        hue_vals = np.sort(data[color].unique())
        hue_masks = [data[color] == val for val in hue_vals]
        colors = color_palette(palette, len(hue_masks), **palette_kws)

    # Default keyword arguments for plot components. The scatter keywords
    # are copied so that pulling out the marker settings neither alters the
    # caller's dictionary nor loses them after the first group is drawn.
    scatter_kws = {} if scatter_kws is None else dict(scatter_kws)
    if line_kws is None:
        line_kws = {}

    user_ms = scatter_kws.pop("ms", None)
    point_ms = 4 if user_ms is None else user_ms
    estimate_ms = 7 if user_ms is None else user_ms
    mew = scatter_kws.pop("mew", 0)

    # First walk through the facets and plot the scatters
    for row_i, row_mask in enumerate(row_masks):
        for col_j, col_mask in enumerate(col_masks):
            ax = axes[row_i, col_j]
            if not sharex or (row_i + 1 == nrow):
                ax.set_xlabel(x)
            if not sharey or col_j == 0:
                ax.set_ylabel(y)

            # Title the plot if we are faceting
            title_parts = []
            if row is not None:
                title_parts.append("%s = %s" % (row, row_vals[row_i]))
            if col is not None:
                title_parts.append("%s = %s" % (col, col_vals[col_j]))
            ax.set_title(" | ".join(title_parts))

            for hue_k, hue_mask in enumerate(hue_masks):
                hue_color = colors[hue_k]
                data_ijk = data[row_mask & col_mask & hue_mask]
                if not len(data_ijk):
                    # No observations fall in this facet/color cell
                    continue

                if x_estimator is not None:
                    # Treat x as a factor: one point estimate per level
                    # with a bootstrapped confidence interval around it
                    x_vals = data_ijk[x].unique()
                    y_grouped = [
                        np.array(data_ijk[y][data_ijk[x] == v]) for v in x_vals
                    ]
                    y_est = [x_estimator(y_i) for y_i in y_grouped]
                    y_boots = [
                        moss.bootstrap(y_i, func=x_estimator)
                        for y_i in y_grouped
                    ]
                    ci_lims = [50 - x_ci / 2., 50 + x_ci / 2.]
                    y_ci = [moss.percentiles(y_i, ci_lims) for y_i in y_boots]
                    y_error = ci_to_errsize(np.transpose(y_ci), y_est)

                    ax.plot(x_vals,
                            y_est,
                            "o",
                            mew=mew,
                            ms=estimate_ms,
                            color=hue_color,
                            **scatter_kws)
                    # NOTE(review): newer matplotlib releases appear to
                    # expect fmt="none" here -- confirm against the
                    # matplotlib version this module supports.
                    ax.errorbar(x_vals, y_est, y_error, fmt=None,
                                ecolor=hue_color)
                else:
                    ax.plot(data_ijk[x],
                            data_ijk[y],
                            "o",
                            color=hue_color,
                            mew=mew,
                            ms=point_ms,
                            **scatter_kws)

    for ax_i in np.ravel(axes):
        ax_i.set_xmargin(.05)
        ax_i.autoscale_view()

    # Now walk through again and plot the regression estimate
    # and a confidence interval for the regression line
    if fit_line:
        for row_i, row_mask in enumerate(row_masks):
            for col_j, col_mask in enumerate(col_masks):
                ax = axes[row_i, col_j]
                xlim = ax.get_xlim()

                for hue_k, hue_mask in enumerate(hue_masks):
                    hue_color = colors[hue_k]
                    data_ijk = data[row_mask & col_mask & hue_mask]
                    x_vals = np.array(data_ijk[x])
                    y_vals = np.array(data_ijk[y])
                    if not len(x_vals):
                        # Nothing to fit in an empty facet/color cell
                        continue

                    # Sort out the limit of the fit
                    if truncate:
                        xx = np.linspace(x_vals.min(), x_vals.max(), 100)
                    else:
                        xx = np.linspace(xlim[0], xlim[1], 100)

                    # Inner function to bootstrap the regression; it is
                    # only called within this iteration, so closing over
                    # the current `xx` is safe.
                    def _bootstrap_reg(x, y):
                        fit = np.polyfit(x, y, 1)
                        return np.polyval(fit, xx)

                    # Regression line confidence interval
                    if ci is not None:
                        ci_lims = [50 - ci / 2., 50 + ci / 2.]
                        boots = moss.bootstrap(x_vals,
                                               y_vals,
                                               func=_bootstrap_reg)
                        ci_band = moss.percentiles(boots, ci_lims, axis=0)
                        ax.fill_between(xx, *ci_band, color=hue_color,
                                        alpha=.15)

                    reg = _bootstrap_reg(x_vals, y_vals)
                    ax.plot(xx, reg, color=hue_color,
                            label=str(hue_vals[hue_k]), **line_kws)
                    # Drawing the line must not change the scatter limits
                    ax.set_xlim(xlim)

    # Plot the legend on the upper left facet and adjust the layout
    if color is not None:
        axes[0, 0].legend(loc="best", title=color)
    plt.tight_layout()
Beispiel #3
0
def _plot_ci_bars(ax, x, central_data, ci, color, **kwargs):
    """Draw confidence-interval error bars around each central data point.

    The interval limits in `ci` are turned into error bar sizes relative to
    `central_data` by ci_to_errsize and handed to ``ax.errorbar`` with
    `color` as the bar color. Extra keyword arguments are accepted but are
    not forwarded to matplotlib.
    """
    bar_sizes = ci_to_errsize(ci, central_data)
    ax.errorbar(x, central_data, yerr=bar_sizes, ecolor=color, fmt=None)
Beispiel #4
0
	def plot(self, ax, kws):
		"""Render every semantic subset as a line on ``ax``.

		``kws`` holds matplotlib keyword arguments. It is modified in place:
		the color, marker, line width and dashes entries are replaced per
		subset, and the short aliases "lw", "mew" and "mec" are consumed.
		Raises ValueError for an unknown ``err_style`` or when an estimator
		is combined with units.
		"""

		# Plot an empty throwaway line with the caller's kwargs. This both
		# advances the axes property cycle (as the plain matplotlib call
		# would) and reveals which color/marker/width matplotlib would pick,
		# so those can serve as fallbacks when a semantic is not mapped.
		# There is no internal cycling: without ``hue`` every element shares
		# that single fallback color.
		probe, = ax.plot([], [], **kws)

		base_color = kws.pop("color", probe.get_color())
		base_marker = kws.pop("marker", probe.get_marker())
		# "lw" is always consumed, even when "linewidth" takes precedence
		lw_fallback = kws.pop("lw", probe.get_linewidth())
		base_linewidth = kws.pop("linewidth", lw_fallback)
		base_dashes = kws.pop("dashes", "")

		# Normalize the marker edge aliases onto their long names
		edge_width = kws.pop("mew", .75)
		kws.setdefault("markeredgewidth", edge_width)
		edge_color = kws.pop("mec", "w")
		kws.setdefault("markeredgecolor", edge_color)

		probe.remove()

		# Defaults for the error representation
		err_kws = self.err_kws.copy()
		if self.err_style == "band":
			err_kws.setdefault("alpha", .2)
		elif not (self.err_style == "bars" or self.err_style is None):
			message = "`err_style` must be 'band' or 'bars', not {}"
			raise ValueError(message.format(self.err_style))

		# One line (or one line per unit) for each semantic subset
		for (hue, size, style), subset in self.subset_data():

			x = subset["x"]
			y = subset["y"]
			units = subset.get("units", None)

			if self.estimator is None:
				y_ci = None
			else:
				if self.units is not None:
					message = "estimator must be None when specifying units"
					raise ValueError(message)
				x, y, y_ci = self.aggregate(y, x, units)

			# Semantic mappings win; otherwise use the probed fallbacks
			kws["color"] = self.palette.get(hue, base_color)
			kws["dashes"] = self.dashes.get(style, base_dashes)
			kws["marker"] = self.markers.get(style, base_marker)
			kws["linewidth"] = self.sizes.get(size, base_linewidth)

			# Second throwaway line: read back the resolved attributes that
			# the error bars / bands need to match
			sample, = ax.plot([], [], **kws)
			line_color = sample.get_color()
			line_alpha = sample.get_alpha()
			line_capstyle = sample.get_solid_capstyle()
			sample.remove()

			# --- Main line

			x = np.asarray(x)
			y = np.asarray(y)

			if self.units is not None:
				for unit in units.unique():
					in_unit = np.asarray(units == unit)
					ax.plot(x[in_unit], y[in_unit], **kws)
			else:
				drawn, = ax.plot(x, y, **kws)

			# --- Confidence intervals

			if y_ci is None:
				continue

			low = np.asarray(y_ci["low"])
			high = np.asarray(y_ci["high"])

			if self.err_style == "band":

				ax.fill_between(x, low, high, color=line_color, **err_kws)

			elif self.err_style == "bars":

				bar_sizes = ci_to_errsize((low, high), y)
				ebars = ax.errorbar(
					x, y, bar_sizes, linestyle="",
					color=line_color, alpha=line_alpha,
					**err_kws
				)

				# Give the error bar artists the same cap style as the line
				for artist in ebars.get_children():
					try:
						artist.set_capstyle(line_capstyle)
					except AttributeError:
						# Does not exist on mpl < 2.2
						pass

		# Axis labels and, if requested, the legend
		self.label_axes(ax)
		if self.legend:
			self.add_legend_data(ax)
			handles, _ = ax.get_legend_handles_labels()
			if handles:
				ax.legend()
Beispiel #5
0
def lmplot(x, y, data, color=None, row=None, col=None, col_wrap=None,
           x_estimator=None, x_ci=95, n_boot=5000, fit_reg=True,
           order=1, ci=95, logistic=False, truncate=False,
           x_partial=None, y_partial=None, x_jitter=None, y_jitter=None,
           sharex=True, sharey=True, palette="husl", size=None,
           scatter_kws=None, line_kws=None, palette_kws=None):
    """Plot a linear model from a DataFrame.

    Parameters
    ----------
    x, y : strings
        column names in `data` DataFrame for x and y variables
    data : DataFrame
        source of data for the model
    color : string, optional
        DataFrame column name to group the model by color
    row, col : strings, optional
        DataFrame column names to make separate plot facets
    col_wrap : int, optional
        wrap col variable at this width - cannot be used with row facet
    x_estimator : callable, optional
        Interpret X values as factor labels and use this function
        to plot the point estimate and bootstrapped CI
    x_ci : int optional
        size of confidence interval for x_estimator error bars
    n_boot : int, optional
        number of bootstrap iterations to perform
    fit_reg : bool, optional
        if True fit a regression model by color/row/col and plot
    order : int, optional
        order of the regression polynomial to fit (default = 1)
    ci : int, optional
        confidence interval for the regression line; None skips the band
    logistic : bool, optional
        fit the regression line with logistic regression
    truncate : bool, optional
        if True, only fit line from data min to data max
    {x, y}_partial : string or list of strings, optional
        regress these variables out of the factors before plotting
    {x, y}_jitter : float, optional
        parameters for uniformly distributed random noise added to positions
    sharex, sharey : bools, optional
        only relevant if faceting; passed to plt.subplots
    palette : seaborn color palette argument
        if using separate plots by color, draw with this color palette
    size : float, optional
        size (plots are square) for each plot facet
    {scatter, line}_kws : dictionary
        keyword arguments to pass to the underlying plot functions;
        "ms" and "mew" in scatter_kws apply to both raw and estimated
        points, "alpha" only to raw points, and the caller's dictionary
        is left unmodified
    palette_kws : dictionary
        keyword arguments for seaborn.color_palette

    Returns
    -------
    None
        the figure is created with plt.subplots and drawn on in place

    Raises
    ------
    ValueError
        if `col_wrap` is given without `col`, or together with `row`

    """
    # TODO
    # - legend when fit_reg is False
    # - wrap title when wide

    def _regress_out(values, frame, confounds):
        """Return `values` with each mean-centered confound column removed.

        Works on new objects throughout so that neither `values` nor the
        columns of `frame` are modified in place.
        """
        if confounds is None:
            return values
        for var in confounds:
            conf = frame[var] - frame[var].mean()
            center = values.mean()
            values = moss.vector_reject(values - center, conf) + center
        return values

    def _facet_position(row_i, col_j):
        """Map facet indices onto the axes grid, wrapping columns if asked."""
        if col_wrap is not None:
            return col_j // ncol, col_j % ncol
        return row_i, col_j

    # First sort out the general figure layout
    if size is None:
        size = mpl.rcParams["figure.figsize"][1]

    if col is None and col_wrap is not None:
        raise ValueError("Need column facet variable for `col_wrap`")
    if row is not None and col_wrap is not None:
        raise ValueError("Cannot facet rows when using `col_wrap`")

    # Mask that selects every observation; used for any unused facet/hue
    no_mask = np.repeat(True, len(data))

    # Decide on faceting from whether a variable was requested rather than
    # from the grid shape, so that single-level facets and `col_wrap=1`
    # still get one mask (and one title value) per level.
    if row is None:
        row_vals, row_masks = [None], [no_mask]
    else:
        row_vals = np.sort(data[row].unique())
        row_masks = [data[row] == val for val in row_vals]

    if col is None:
        col_vals, col_masks = [None], [no_mask]
    else:
        col_vals = np.sort(data[col].unique())
        col_masks = [data[col] == val for val in col_vals]

    nrow, ncol = len(row_masks), len(col_masks)

    if col_wrap is not None:
        ncol = col_wrap
        # Force true division so a partially filled last row is counted
        # even where `/` on two ints would truncate
        nrow = int(np.ceil(len(col_masks) / float(col_wrap)))

    f, axes = plt.subplots(nrow, ncol, sharex=sharex, sharey=sharey,
                           figsize=(size * ncol, size * nrow))
    axes = np.atleast_2d(axes).reshape(nrow, ncol)

    if x_partial is not None and not isinstance(x_partial, list):
        x_partial = [x_partial]
    if y_partial is not None and not isinstance(y_partial, list):
        y_partial = [y_partial]

    if palette_kws is None:
        palette_kws = {}

    # Sort out the plot colors
    if color is None:
        hue_vals = [""]
        hue_masks = [no_mask]
        colors = ["#222222"]
    else:
        hue_vals = np.sort(data[color].unique())
        hue_masks = [data[color] == val for val in hue_vals]
        colors = color_palette(palette, len(hue_masks), **palette_kws)

    # Default keyword arguments for plot components. The scatter keywords
    # are copied so that pulling out the marker settings does not alter
    # the caller's dictionary.
    scatter_kws = {} if scatter_kws is None else dict(scatter_kws)
    if line_kws is None:
        line_kws = {}

    user_ms = scatter_kws.pop("ms", None)
    scatter_ms = 4 if user_ms is None else user_ms
    estimate_ms = 7 if user_ms is None else user_ms
    scatter_mew = scatter_kws.pop("mew", 0)
    scatter_alpha = scatter_kws.pop("alpha", .77)

    # First walk through the facets and plot the scatters
    for row_i, row_mask in enumerate(row_masks):
        for col_j, col_mask in enumerate(col_masks):
            f_row, f_col = _facet_position(row_i, col_j)
            ax = axes[f_row, f_col]
            if f_row + 1 == nrow:
                ax.set_xlabel(x)
            if f_col == 0:
                ax.set_ylabel(y)

            # Title the plot if we are faceting
            title_parts = []
            if row is not None:
                title_parts.append("%s = %s" % (row, row_vals[row_i]))
            if col is not None:
                title_parts.append("%s = %s" % (col, col_vals[col_j]))
            ax.set_title(" | ".join(title_parts))

            for hue_k, hue_mask in enumerate(hue_masks):
                hue_color = colors[hue_k]
                data_ijk = data[row_mask & col_mask & hue_mask]
                if not len(data_ijk):
                    # No observations fall in this facet/color cell
                    continue

                if x_estimator is not None:
                    # Treat x as a factor: one point estimate per level
                    # with a bootstrapped confidence interval around it
                    x_vals = data_ijk[x].unique()
                    y_vals = _regress_out(data_ijk[y], data_ijk, y_partial)

                    y_grouped = [np.array(y_vals[data_ijk[x] == v])
                                 for v in x_vals]

                    y_est = [x_estimator(y_i) for y_i in y_grouped]
                    y_boots = [moss.bootstrap(y_i,
                                              func=x_estimator,
                                              n_boot=n_boot)
                               for y_i in y_grouped]
                    ci_lims = [50 - x_ci / 2., 50 + x_ci / 2.]
                    y_ci = [moss.percentiles(y_i, ci_lims) for y_i in y_boots]
                    y_error = ci_to_errsize(np.transpose(y_ci), y_est)

                    ax.plot(x_vals, y_est, "o", mew=scatter_mew,
                            ms=estimate_ms, color=hue_color, **scatter_kws)
                    # NOTE(review): newer matplotlib releases appear to
                    # expect fmt="none" here -- confirm against the
                    # matplotlib version this module supports.
                    ax.errorbar(x_vals, y_est, y_error,
                                fmt=None, ecolor=hue_color)
                else:
                    x_ = _regress_out(data_ijk[x], data_ijk, x_partial)
                    y_ = _regress_out(data_ijk[y], data_ijk, y_partial)

                    # Add the jitter to new objects; `x_`/`y_` may still be
                    # columns of the DataFrame slice at this point
                    if x_jitter is not None:
                        x_ = x_ + np.random.uniform(-x_jitter, x_jitter,
                                                    x_.shape)
                    if y_jitter is not None:
                        y_ = y_ + np.random.uniform(-y_jitter, y_jitter,
                                                    y_.shape)
                    ax.plot(x_, y_, "o", color=hue_color,
                            alpha=scatter_alpha, mew=scatter_mew,
                            ms=scatter_ms, **scatter_kws)

    for ax_i in np.ravel(axes):
        ax_i.set_xmargin(.05)
        ax_i.autoscale_view()

    # Now walk through again and plot the regression estimate
    # and a confidence interval for the regression line
    if fit_reg:
        for row_i, row_mask in enumerate(row_masks):
            for col_j, col_mask in enumerate(col_masks):
                f_row, f_col = _facet_position(row_i, col_j)
                ax = axes[f_row, f_col]
                xlim = ax.get_xlim()

                for hue_k, hue_mask in enumerate(hue_masks):
                    hue_color = colors[hue_k]
                    data_ijk = data[row_mask & col_mask & hue_mask]
                    x_vals = np.array(data_ijk[x])
                    y_vals = np.array(data_ijk[y])
                    if not len(x_vals):
                        continue

                    # Sort out the limit of the fit
                    if truncate:
                        xx = np.linspace(x_vals.min(),
                                         x_vals.max(), 100)
                    else:
                        xx = np.linspace(xlim[0], xlim[1], 100)
                    xx_ = sm.add_constant(xx, prepend=True)

                    # Inner function to bootstrap the regression; it is
                    # only called within this iteration, so closing over
                    # the current `xx`/`xx_` is safe.
                    def _regress(x, y):
                        if logistic:
                            x_ = sm.add_constant(x, prepend=True)
                            fit = sm.GLM(y, x_,
                                         family=sm.families.Binomial()).fit()
                            reg = fit.predict(xx_)
                        else:
                            fit = np.polyfit(x, y, order)
                            reg = np.polyval(fit, xx)
                        return reg

                    # Remove nuisance variables with vector rejection
                    x_vals = _regress_out(x_vals, data_ijk, x_partial)
                    y_vals = _regress_out(y_vals, data_ijk, y_partial)

                    # Regression line confidence interval
                    if ci is not None:
                        ci_lims = [50 - ci / 2., 50 + ci / 2.]
                        boots = moss.bootstrap(x_vals, y_vals,
                                               func=_regress,
                                               n_boot=n_boot)
                        ci_band = moss.percentiles(boots, ci_lims, axis=0)
                        ax.fill_between(xx, *ci_band, color=hue_color,
                                        alpha=.15)

                    # Regression line
                    reg = _regress(x_vals, y_vals)
                    ax.plot(xx, reg, color=hue_color,
                            label=str(hue_vals[hue_k]), **line_kws)
                    # Drawing the line must not change the scatter limits
                    ax.set_xlim(xlim)

    # Plot the legend on the upper left facet and adjust the layout; it is
    # skipped when the color variable is already spelled out by the facets
    if color is not None and color not in [row, col]:
        axes[0, 0].legend(loc="best", title=color)
    plt.tight_layout()
Beispiel #6
0
def lmplot(x, y, data, color=None, row=None, col=None, col_wrap=None,
           x_estimator=None, x_ci=95, x_bins=None, n_boot=5000, fit_reg=True,
           order=1, ci=95, logistic=False, truncate=False,
           x_partial=None, y_partial=None, x_jitter=None, y_jitter=None,
           sharex=True, sharey=True, palette="husl", size=None,
           scatter_kws=None, line_kws=None, palette_kws=None):
    """Plot a linear model with faceting, color binning, and other options.

    Parameters
    ----------
    x, y : strings
        Column names in `data` DataFrame for x and y variables.
    data : DataFrame
        Dource of data for the model.
    color : string, optional
        DataFrame column name to group the model by color.
    row, col : strings, optional
        DataFrame column names to make separate plot facets.
    col_wrap : int, optional
        Wrap col variable at this width - cannot be used with row facet.
    x_estimator : callable, optional
        Interpret X values as factor labels and use this function
        to plot the point estimate and bootstrapped CI.
    x_ci : int optional
        Size of confidence interval for x_estimator error bars.
    x_bins : sequence of floats, optional
        Bin the x variable with these values. Implies that x_estimator is
        mean, unless otherwise provided.
    n_boot : int, optional
        Number of bootstrap iterations to perform.
    fit_reg : bool, optional
        If True fit a regression model by color/row/col and plot.
    order : int, optional
        Order of the regression polynomial to fit.
    ci : int, optional
        Confidence interval for the regression line.
    logistic : bool, optional
        Fit the regression line with logistic regression.
    truncate : bool, optional
        If True, only fit line from data min to data max.
    {x, y}_partial : string or list of strings, optional
        Regress these variables out of the factors before plotting.
    {x, y}_jitter : float, optional
        Parameters for uniformly distributed random noise added to positions.
    sharex, sharey : bools, optional
        Only relevant if faceting; passed to plt.subplots.
    palette : seaborn color palette argument
        If using separate plots by color, draw with this color palette.
    size : float, optional
        Size (plots are square) for each plot facet.
    {scatter, line}_kws : dictionary
        Keyword arguments to pass to the underlying plot functions.
    palette_kws : dictionary
        Keyword arguments for seaborn.color_palette.

    """
    # TODO
    # - legend when fit_line is False

    # First sort out the general figure layout
    if size is None:
        size = mpl.rcParams["figure.figsize"][1]

    if col is None and col_wrap is not None:
        raise ValueError("Need column facet variable for `col_wrap`")
    if row is not None and col_wrap is not None:
        raise ValueError("Cannot facet rows when using `col_wrap`")

    nrow = 1 if row is None else len(data[row].unique())
    ncol = 1 if col is None else len(data[col].unique())

    if col_wrap is not None:
        ncol = col_wrap
        nrow = int(np.ceil(len(data[col].unique()) / col_wrap))

    f, axes = plt.subplots(nrow, ncol, sharex=sharex, sharey=sharey,
                           figsize=(size * ncol, size * nrow))
    axes = np.atleast_2d(axes).reshape(nrow, ncol)

    if nrow == 1 or col_wrap is not None:
        row_masks = [np.repeat(True, len(data))]
    else:
        row_vals = np.sort(data[row].unique())
        row_masks = [data[row] == val for val in row_vals]

    if ncol == 1:
        col_masks = [np.repeat(True, len(data))]
    else:
        col_vals = np.sort(data[col].unique())
        col_masks = [data[col] == val for val in col_vals]

    if x_bins is not None:
        x_estimator = np.mean if x_estimator is None else x_estimator
        x_bins = np.c_[x_bins]

    if x_partial is not None:
        if not isinstance(x_partial, list):
            x_partial = [x_partial]
    if y_partial is not None:
        if not isinstance(y_partial, list):
            y_partial = [y_partial]

    if palette_kws is None:
        palette_kws = {}

    # Sort out the plot colors
    color_factor = color
    if color is None:
        hue_masks = [np.repeat(True, len(data))]
        colors = ["#222222"]
    else:
        hue_vals = np.sort(data[color].unique())
        hue_masks = [data[color] == val for val in hue_vals]
        colors = color_palette(palette, len(hue_masks), **palette_kws)

    # Default keyword arguments for plot components
    if scatter_kws is None:
        scatter_kws = {}
    if line_kws is None:
        line_kws = {}

    # First walk through the facets and plot the scatters
    scatter_ms = scatter_kws.pop("ms", 4)
    scatter_mew = mew = scatter_kws.pop("mew", 0)
    scatter_alpha = mew = scatter_kws.pop("alpha", .77)
    for row_i, row_mask in enumerate(row_masks):
        for col_j, col_mask in enumerate(col_masks):
            if col_wrap is not None:
                f_row = col_j // ncol
                f_col = col_j % ncol
            else:
                f_row, f_col = row_i, col_j
            ax = axes[f_row, f_col]
            if f_row + 1 == nrow:
                ax.set_xlabel(x)
            if f_col == 0:
                ax.set_ylabel(y)

            # Title the plot if we are faceting
            title = ""
            if row is not None:
                title += "%s = %s" % (row, row_vals[row_i])
            if row is not None and col is not None:
                title += " | "
            if col is not None:
                title += "%s = %s" % (col, col_vals[col_j])
            if size < 3:
                title = title.replace(" | ", "\n")
            ax.set_title(title)

            for hue_k, hue_mask in enumerate(hue_masks):
                color = colors[hue_k]
                data_ijk = data[row_mask & col_mask & hue_mask]

                if x_estimator is not None:
                    ms = scatter_kws.pop("ms", 7)
                    mew = scatter_kws.pop("mew", 0)
                    if x_bins is None:
                        x_vals = data_ijk[x].unique()
                        x_data = data_ijk[x]
                    else:
                        dist = distance.cdist(np.c_[data_ijk[x]], x_bins)
                        x_vals = x_bins.ravel()
                        x_data = x_bins[np.argmin(dist, axis=1)].ravel()

                    y_vals = data_ijk[y]

                    if y_partial is not None:
                        for var in y_partial:
                            conf = data_ijk[var]
                            conf -= conf.mean()
                            y_mean = y_vals.mean()
                            y_vals = moss.vector_reject(y_vals - y_mean, conf)
                            y_vals += y_mean

                    y_grouped = [np.array(y_vals[x_data == v])
                                 for v in x_vals]

                    y_est = [x_estimator(y_i) for y_i in y_grouped]
                    y_boots = [moss.bootstrap(np.array(y_i),
                                              func=x_estimator,
                                              n_boot=n_boot)
                               for y_i in y_grouped]
                    ci_lims = [50 - x_ci / 2., 50 + x_ci / 2.]
                    y_ci = [moss.percentiles(y_i, ci_lims) for y_i in y_boots]
                    y_error = ci_to_errsize(np.transpose(y_ci), y_est)

                    ax.plot(x_vals, y_est, "o", mew=mew, ms=ms,
                            color=color, **scatter_kws)
                    ax.errorbar(x_vals, y_est, y_error,
                                fmt=None, ecolor=color)
                else:
                    x_ = data_ijk[x]
                    y_ = data_ijk[y]

                    if x_partial is not None:
                        for var in x_partial:
                            conf = data_ijk[var]
                            conf -= conf.mean()
                            x_mean = x_.mean()
                            x_ = moss.vector_reject(x_ - x_mean, conf)
                            x_ += x_mean
                    if y_partial is not None:
                        for var in y_partial:
                            conf = data_ijk[var]
                            conf -= conf.mean()
                            y_mean = y_.mean()
                            y_ = moss.vector_reject(y_ - y_mean, conf)
                            y_ += y_mean

                    if x_jitter is not None:
                        x_ += np.random.uniform(-x_jitter, x_jitter, x_.shape)
                    if y_jitter is not None:
                        y_ += np.random.uniform(-y_jitter, y_jitter, y_.shape)
                    ax.plot(x_, y_, "o", color=color, alpha=scatter_alpha,
                            mew=scatter_mew, ms=scatter_ms, **scatter_kws)

    for ax_i in np.ravel(axes):
        ax_i.set_xmargin(.05)
        ax_i.autoscale_view()

    # Now walk through again and plot the regression estimate
    # and a confidence interval for the regression line
    if fit_reg:
        for row_i, row_mask in enumerate(row_masks):
            for col_j, col_mask in enumerate(col_masks):
                if col_wrap is not None:
                    f_row = col_j // ncol
                    f_col = col_j % ncol
                else:
                    f_row, f_col = row_i, col_j
                ax = axes[f_row, f_col]
                xlim = ax.get_xlim()

                for hue_k, hue_mask in enumerate(hue_masks):
                    color = colors[hue_k]
                    data_ijk = data[row_mask & col_mask & hue_mask]
                    x_vals = np.array(data_ijk[x])
                    y_vals = np.array(data_ijk[y])
                    if not len(x_vals):
                        continue

                    # Sort out the limit of the fit
                    if truncate:
                        xx = np.linspace(x_vals.min(),
                                         x_vals.max(), 100)
                    else:
                        xx = np.linspace(xlim[0], xlim[1], 100)
                    xx_ = sm.add_constant(xx, prepend=True)

                    # Inner function to bootstrap the regression
                    def _regress(x, y):
                        if logistic:
                            x_ = sm.add_constant(x, prepend=True)
                            fit = sm.GLM(y, x_,
                                         family=sm.families.Binomial()).fit()
                            reg = fit.predict(xx_)
                        else:
                            fit = np.polyfit(x, y, order)
                            reg = np.polyval(fit, xx)
                        return reg

                    # Remove nuisance variables with vector rejection
                    if x_partial is not None:
                        for var in x_partial:
                            conf = data_ijk[var]
                            conf -= conf.mean()
                            x_mean = x_vals.mean()
                            x_vals = moss.vector_reject(x_vals - x_mean, conf)
                            x_vals += x_mean
                    if y_partial is not None:
                        for var in y_partial:
                            conf = data_ijk[var]
                            conf -= conf.mean()
                            y_mean = y_vals.mean()
                            y_vals = moss.vector_reject(y_vals - y_mean, conf)
                            y_vals += y_mean

                    # Regression line confidence interval
                    if ci is not None:
                        ci_lims = [50 - ci / 2., 50 + ci / 2.]
                        boots = moss.bootstrap(x_vals, y_vals,
                                               func=_regress,
                                               n_boot=n_boot)
                        ci_band = moss.percentiles(boots, ci_lims, axis=0)
                        ax.fill_between(xx, *ci_band, color=color, alpha=.15)

                    # Regression line
                    reg = _regress(x_vals, y_vals)
                    if color_factor is None:
                        label = ""
                    else:
                        label = hue_vals[hue_k]
                    ax.plot(xx, reg, color=color,
                            label=str(label), **line_kws)
                    ax.set_xlim(xlim)

    # Plot the legend on the upper left facet and adjust the layout
    if color_factor is not None and color_factor not in [row, col]:
        axes[0, 0].legend(loc="best", title=color_factor)
    plt.tight_layout()
Beispiel #7
0
def _plot_ci_bars(ax, x, central_data, ci, color, err_kws, **kwargs):
    """Plot error bars at each data point.

    Parameters
    ----------
    ax : matplotlib Axes
        axes object that the error bars are drawn on
    x : sequence
        x positions of the data points
    central_data : sequence
        central values; the error bars are drawn around these
    ci : sequence
        confidence interval limits, converted to error bar sizes by
        ``ci_to_errsize(ci, central_data)``
    color : matplotlib color
        color of the error bars
    err_kws : dictionary
        additional keyword arguments forwarded to ``ax.errorbar``
    kwargs : key, value mappings
        accepted so the signature matches a common calling convention;
        not used here

    """
    err = ci_to_errsize(ci, central_data)
    # The string "none" (not the None object) suppresses the data markers;
    # fmt=None was deprecated in matplotlib 1.4 and is rejected by
    # current releases.
    ax.errorbar(x, central_data, yerr=err, fmt="none", ecolor=color,
                label="_nolegend_", **err_kws)
Beispiel #8
0
def _plot_ci_bars(ax, x, central_data, ci, color, err_kws, **kwargs):
    """Plot error bars at each data point.

    Parameters
    ----------
    ax : matplotlib Axes
        axes object that the error bars are drawn on
    x : sequence
        x positions of the data points
    central_data : sequence
        central values; the error bars are drawn around these
    ci : sequence
        confidence interval limits, converted to error bar sizes by
        ``ci_to_errsize(ci, central_data)``
    color : matplotlib color
        color of the error bars
    err_kws : dictionary
        additional keyword arguments forwarded to ``ax.errorbar``
    kwargs : key, value mappings
        accepted so the signature matches a common calling convention;
        not used here

    """
    err = ci_to_errsize(ci, central_data)
    # The string "none" (not the None object) suppresses the data markers;
    # fmt=None was deprecated in matplotlib 1.4 and is rejected by
    # current releases.
    ax.errorbar(x, central_data, yerr=err, fmt="none", ecolor=color,
                label="_nolegend_", **err_kws)
Beispiel #9
0
def lmplot(x, y, data, color=None, row=None, col=None,
           x_estimator=None, x_ci=95,
           fit_line=True, ci=95, truncate=False,
           sharex=True, sharey=True, palette="hls", size=None,
           scatter_kws=None, line_kws=None, palette_kws=None):
    """Plot a linear model from a DataFrame.

    The figure is a grid of facets (one per row/col level). Each facet
    gets a scatter plot (or point estimates with bootstrapped error bars
    when `x_estimator` is given) per color level, optionally overlaid with
    a first-order polynomial fit and a bootstrapped confidence band.

    Parameters
    ----------
    x, y : strings
        column names in `data` DataFrame for x and y variables
    data : DataFrame
        source of data for the model
    color : string, optional
        DataFrame column name to group the model by color
    row, col : strings, optional
        DataFrame column names to make separate plot facets
    x_estimator : callable, optional
        Interpret X values as factor labels and use this function
        to plot the point estimate and bootstrapped CI
    x_ci : int optional
        size of confidence interval for x_estimator error bars
    fit_line : bool, optional
        if True fit a regression line by color/row/col and plot
    ci : int, optional
        confidence interval for the regression line; if None, no
        confidence band is drawn
    truncate : bool, optional
        if True, only fit line from data min to data max
    sharex, sharey : bools, optional
        only relevant if faceting; passed to plt.subplots
    palette : seaborn color palette argument
        if using separate plots by color, draw with this color palette
    size : float, optional
        size (plots are square) for each plot facet; defaults to the
        height in the "figure.figsize" rc parameter
    {scatter, line}_kws : dictionary
        keyword arguments to pass to the underlying plot functions;
        the dictionaries passed in are not modified
    palette_kws : dictionary
        keyword arguments for seaborn.color_palette

    Notes
    -----
    Facet/color subsets that contain no observations are skipped.

    """
    # TODO
    # - position_{dodge, jitter}
    # - legend when fit_line is False
    # - truncate fit
    # - wrap title when wide
    # - wrap columns

    # First sort out the general figure layout
    if size is None:
        size = mpl.rcParams["figure.figsize"][1]

    # The facet levels are needed for the titles whenever a facet variable
    # is given, even when that variable has only a single level.
    row_vals = None if row is None else np.sort(data[row].unique())
    col_vals = None if col is None else np.sort(data[col].unique())
    nrow = 1 if row is None else len(row_vals)
    ncol = 1 if col is None else len(col_vals)

    f, axes = plt.subplots(nrow, ncol, sharex=sharex, sharey=sharey,
                           figsize=(size * ncol, size * nrow))
    axes = np.atleast_2d(axes).reshape(nrow, ncol)

    if nrow == 1:
        row_masks = [np.repeat(True, len(data))]
    else:
        row_masks = [data[row] == val for val in row_vals]

    if ncol == 1:
        col_masks = [np.repeat(True, len(data))]
    else:
        col_masks = [data[col] == val for val in col_vals]

    if palette_kws is None:
        palette_kws = {}

    # Sort out the plot colors
    color_factor = color
    if color_factor is None:
        hue_masks = [np.repeat(True, len(data))]
        colors = ["#222222"]
    else:
        hue_vals = np.sort(data[color_factor].unique())
        hue_masks = [data[color_factor] == val for val in hue_vals]
        colors = color_palette(palette, len(hue_masks), **palette_kws)

    # Default keyword arguments for plot components. The scatter keywords
    # are copied so the caller's dictionary is left untouched, and the
    # marker settings are extracted once so that they apply to every
    # facet and color level rather than only the first one plotted.
    scatter_kws = {} if scatter_kws is None else dict(scatter_kws)
    if line_kws is None:
        line_kws = {}
    ms = scatter_kws.pop("ms", 4 if x_estimator is None else 7)
    mew = scatter_kws.pop("mew", 0)

    # First walk through the facets and plot the scatters
    for row_i, row_mask in enumerate(row_masks):
        for col_j, col_mask in enumerate(col_masks):
            ax = axes[row_i, col_j]
            if not sharex or (row_i + 1 == len(row_masks)):
                ax.set_xlabel(x)
            if not sharey or col_j == 0:
                ax.set_ylabel(y)

            # Title the plot if we are faceting
            title = ""
            if row is not None:
                title += "%s = %s" % (row, row_vals[row_i])
            if row is not None and col is not None:
                title += " | "
            if col is not None:
                title += "%s = %s" % (col, col_vals[col_j])
            ax.set_title(title)

            for hue_k, hue_mask in enumerate(hue_masks):
                hue_color = colors[hue_k]
                data_ijk = data[row_mask & col_mask & hue_mask]
                if not len(data_ijk):
                    # Nothing to draw for this combination of levels
                    continue

                if x_estimator is not None:
                    x_vals = data_ijk[x].unique()
                    y_grouped = [np.array(data_ijk[y][data_ijk[x] == v])
                                 for v in x_vals]
                    y_est = [x_estimator(y_i) for y_i in y_grouped]
                    y_boots = [moss.bootstrap(np.array(y_i), func=x_estimator)
                               for y_i in y_grouped]
                    ci_lims = [50 - x_ci / 2., 50 + x_ci / 2.]
                    y_ci = [moss.percentiles(y_i, ci_lims) for y_i in y_boots]
                    y_error = ci_to_errsize(np.transpose(y_ci), y_est)

                    ax.plot(x_vals, y_est, "o", mew=mew, ms=ms,
                            color=hue_color, **scatter_kws)
                    # fmt="none" draws only the bars; passing the None
                    # object is rejected by current matplotlib releases.
                    ax.errorbar(x_vals, y_est, y_error,
                                fmt="none", ecolor=hue_color)
                else:
                    ax.plot(data_ijk[x], data_ijk[y], "o",
                            color=hue_color, mew=mew, ms=ms, **scatter_kws)

    for ax_i in np.ravel(axes):
        ax_i.set_xmargin(.05)
        ax_i.autoscale_view()

    # Now walk through again and plot the regression estimate
    # and a confidence interval for the regression line
    if fit_line:
        for row_i, row_mask in enumerate(row_masks):
            for col_j, col_mask in enumerate(col_masks):
                ax = axes[row_i, col_j]
                xlim = ax.get_xlim()

                for hue_k, hue_mask in enumerate(hue_masks):
                    hue_color = colors[hue_k]
                    data_ijk = data[row_mask & col_mask & hue_mask]
                    x_vals = np.array(data_ijk[x])
                    y_vals = np.array(data_ijk[y])
                    if not len(x_vals):
                        # A fit is undefined without observations
                        continue

                    # Sort out the limit of the fit
                    if truncate:
                        xx = np.linspace(x_vals.min(),
                                         x_vals.max(), 100)
                    else:
                        xx = np.linspace(xlim[0], xlim[1], 100)

                    # Inner function to bootstrap the regression
                    def _bootstrap_reg(x, y):
                        fit = np.polyfit(x, y, 1)
                        return np.polyval(fit, xx)

                    # Regression line confidence interval
                    if ci is not None:
                        ci_lims = [50 - ci / 2., 50 + ci / 2.]
                        boots = moss.bootstrap(x_vals, y_vals,
                                               func=_bootstrap_reg)
                        ci_band = moss.percentiles(boots, ci_lims, axis=0)
                        ax.fill_between(xx, *ci_band,
                                        color=hue_color, alpha=.15)

                    fit = np.polyfit(x_vals, y_vals, 1)
                    reg = np.polyval(fit, xx)
                    if color_factor is None:
                        label = ""
                    else:
                        label = hue_vals[hue_k]
                    ax.plot(xx, reg, color=hue_color,
                            label=str(label), **line_kws)
                    # Restore the limits captured before the fit was drawn
                    ax.set_xlim(xlim)

    # Plot the legend on the upper left facet and adjust the layout
    if color_factor is not None:
        axes[0, 0].legend(loc="best", title=color_factor)
    plt.tight_layout()
Beispiel #10
0
def _plot_ci_bars(ax, x, central_data, ci, color, **kwargs):
    """Plot error bars at each data point.

    Parameters
    ----------
    ax : matplotlib Axes
        axes object that the error bars are drawn on
    x : sequence
        x positions of the data points
    central_data : sequence
        central values; the error bars are drawn around these
    ci : sequence
        confidence interval limits, converted to error bar sizes by
        ``ci_to_errsize(ci, central_data)``
    color : matplotlib color
        color of the error bars
    kwargs : key, value mappings
        accepted so the signature matches a common calling convention;
        not used here

    """
    err = ci_to_errsize(ci, central_data)
    # The string "none" (not the None object) suppresses the data markers;
    # fmt=None was deprecated in matplotlib 1.4 and is rejected by
    # current releases.
    ax.errorbar(x, central_data, yerr=err, fmt="none", ecolor=color)