Exemple #1
0
    def test_dropna(self):
        """Check how many observations the plotter keeps with and without ``dropna``.

        By default the plotter's ``x`` must be as long as the number of
        non-null ``y_na`` entries; with ``dropna=False`` it must be as long
        as the full column.
        """
        default_plotter = lm._RegressionPlotter("x", "y_na", data=self.df)
        n_non_null = pd.notnull(self.df.y_na).sum()
        assert len(default_plotter.x) == n_non_null

        keep_all_plotter = lm._RegressionPlotter(
            "x", "y_na", data=self.df, dropna=False
        )
        n_rows = len(self.df.y_na)
        assert len(keep_all_plotter.x) == n_rows
Exemple #2
0
    def test_estimate_cis(self):
        """Compare the intervals in ``estimate_data`` across ``ci`` settings.

        With the same seed, the 50% intervals must be narrower than the 95%
        intervals, and ``ci=None`` must produce ``None`` for every interval.
        """
        rng_seed = 123
        shared = {"x_estimator": np.mean}

        wide_plotter = lm._RegressionPlotter(
            self.df.d, self.df.y, ci=95, seed=rng_seed, **shared
        )
        _, _, wide_cis = wide_plotter.estimate_data

        narrow_plotter = lm._RegressionPlotter(
            self.df.d, self.df.y, ci=50, seed=rng_seed, **shared
        )
        _, _, narrow_cis = narrow_plotter.estimate_data
        npt.assert_array_less(np.diff(narrow_cis), np.diff(wide_cis))

        # No seed is needed here: without a confidence level there is no
        # interval to compare.
        bare_plotter = lm._RegressionPlotter(
            self.df.d, self.df.y, ci=None, **shared
        )
        _, _, missing_cis = bare_plotter.estimate_data
        npt.assert_array_equal(missing_cis, [None] * len(missing_cis))
Exemple #3
0
    def test_variables_must_be_1d(self):
        """A two-dimensional array for either variable must raise ``ValueError``."""
        matrix = np.random.randn(20, 2)
        vector = np.random.randn(20)

        # Try the 2-D input first as x, then as y.
        for x_arg, y_arg in ((matrix, vector), (vector, matrix)):
            with pytest.raises(ValueError):
                lm._RegressionPlotter(x_arg, y_arg)
Exemple #4
0
    def test_regression_options(self):
        """Combining ``lowess`` with ``order`` or ``logistic`` must raise ``ValueError``."""
        conflicting_options = ({"order": 2}, {"logistic": True})

        for extra in conflicting_options:
            with pytest.raises(ValueError):
                lm._RegressionPlotter(
                    "x", "y", data=self.df, lowess=True, **extra
                )
Exemple #5
0
    def test_ci(self):
        """Check the ``ci`` and ``x_ci`` attributes for several ``x_ci`` inputs.

        When ``x_ci`` is not passed it must equal ``ci``; an explicit number
        or the string ``"sd"`` must be stored unchanged.
        """
        # Each case pairs the extra keyword arguments with the expected x_ci.
        cases = (
            ({}, 95),
            ({"x_ci": 68}, 68),
            ({"x_ci": "sd"}, "sd"),
        )

        for extra, expected_x_ci in cases:
            plotter = lm._RegressionPlotter(
                "x", "y", data=self.df, ci=95, **extra
            )
            assert plotter.ci == 95
            assert plotter.x_ci == expected_x_ci
Exemple #6
0
    def test_regression_limits(self):
        """Check the extent of the regression grid.

        When an axes is passed, the grid must span the axes' x limits; with
        ``truncate=True`` and no axes, it must span the range of the data.
        """
        _, ax = plt.subplots()
        ax.scatter(self.df.x, self.df.y)
        axes_plotter = lm._RegressionPlotter("x", "y", data=self.df)
        axes_grid, _, _ = axes_plotter.fit_regression(ax)
        x_low, x_high = ax.get_xlim()
        assert axes_grid.min() == x_low
        assert axes_grid.max() == x_high

        truncated_plotter = lm._RegressionPlotter(
            "x", "y", data=self.df, truncate=True
        )
        data_grid, _, _ = truncated_plotter.fit_regression()
        assert data_grid.min() == self.df.x.min()
        assert data_grid.max() == self.df.x.max()
Exemple #7
0
    def test_variables_from_mix(self):
        """Variables may mix a column name and a vector alongside ``data``."""
        shifted_y = self.df.y + 1
        plotter = lm._RegressionPlotter("x", shifted_y, data=self.df)

        npt.assert_array_equal(plotter.x, self.df.x)
        npt.assert_array_equal(plotter.y, shifted_y)
        pdt.assert_frame_equal(plotter.data, self.df)
Exemple #8
0
    def test_lowess_regression(self):
        """A lowess fit must return matching grid/prediction lengths and no error bands."""
        plotter = lm._RegressionPlotter("x", "y", data=self.df, lowess=True)
        fit_result = plotter.fit_regression(x_range=(-3, 3))
        grid, predictions, bands = fit_result

        assert bands is None
        assert len(grid) == len(predictions)
Exemple #9
0
    def test_robust_regression(self):
        """OLS and robust fits over the same range must return equally long predictions."""
        shared = {"data": self.df, "n_boot": self.n_boot}
        x_range = (-3, 3)

        ols_plotter = lm._RegressionPlotter("x", "y", **shared)
        _, ols_predictions, _ = ols_plotter.fit_regression(x_range=x_range)

        robust_plotter = lm._RegressionPlotter(
            "x", "y", robust=True, **shared
        )
        _, robust_predictions, _ = robust_plotter.fit_regression(
            x_range=x_range
        )

        assert len(ols_predictions) == len(robust_predictions)
Exemple #10
0
    def test_variables_from_series(self):
        """Series inputs must be stored as given, with ``data`` left as ``None``."""
        plotter = lm._RegressionPlotter(
            self.df.x, self.df.y, units=self.df.s
        )

        expected_by_attr = (
            ("x", self.df.x),
            ("y", self.df.y),
            ("units", self.df.s),
        )
        for attr, expected in expected_by_attr:
            npt.assert_array_equal(getattr(plotter, attr), expected)

        assert plotter.data is None
Exemple #11
0
    def test_variables_from_frame(self):
        """Column names must resolve to the matching series of ``data``."""
        plotter = lm._RegressionPlotter("x", "y", data=self.df, units="s")

        expected_by_attr = (
            ("x", self.df.x),
            ("y", self.df.y),
            ("units", self.df.s),
        )
        for attr, expected in expected_by_attr:
            pdt.assert_series_equal(getattr(plotter, attr), expected)

        pdt.assert_frame_equal(plotter.data, self.df)
Exemple #12
0
    def test_regress_bootstrap_seed(self):
        """Two plotters built with the same seed must yield identical bootstraps."""
        fixed_seed = 200

        # Build both plotters before fitting either one.
        first, second = [
            lm._RegressionPlotter(
                "x", "y", data=self.df, n_boot=self.n_boot, seed=fixed_seed
            )
            for _ in range(2)
        ]

        _, first_boots = first.fit_fast(self.grid)
        _, second_boots = second.fit_fast(self.grid)
        npt.assert_array_equal(first_boots, second_boots)
Exemple #13
0
    def test_estimate_data(self):
        """Check the point estimates and intervals returned by ``estimate_data``.

        The x values must be the sorted unique levels of ``d``, the y values
        the per-level means, and each mean must lie strictly inside its
        interval.
        """
        plotter = lm._RegressionPlotter(
            self.df.d, self.df.y, x_estimator=np.mean
        )
        levels, estimates, intervals = plotter.estimate_data

        expected_levels = np.sort(np.unique(self.df.d))
        npt.assert_array_equal(levels, expected_levels)

        expected_means = self.df.groupby("d").y.mean()
        npt.assert_array_almost_equal(estimates, expected_means)

        lower_bounds = np.array(intervals)[:, 0]
        npt.assert_array_less(lower_bounds, estimates)
        upper_bounds = np.array(intervals)[:, 1]
        npt.assert_array_less(estimates, upper_bounds)
Exemple #14
0
    def test_estimate_units(self):
        """Intervals computed with ``units`` must be wider than those without."""
        # Use one fixed seed for both plotters.
        rng_seed = 345
        shared = {"data": self.df, "seed": rng_seed, "x_bins": 3}

        units_plotter = lm._RegressionPlotter("x", "y", units="s", **shared)
        _, _, units_cis = units_plotter.estimate_data
        units_widths = np.diff(units_cis, axis=1)

        plain_plotter = lm._RegressionPlotter("x", "y", **shared)
        _, _, plain_cis = plain_plotter.estimate_data
        plain_widths = np.diff(plain_cis, axis=1)

        npt.assert_array_less(plain_widths, units_widths)
Exemple #15
0
    def test_logistic_regression(self):
        """Logistic predictions must lie strictly between 0 and 1."""
        plotter = lm._RegressionPlotter(
            "x", "c", data=self.df, logistic=True, n_boot=self.n_boot
        )
        fit_result = plotter.fit_regression(x_range=(-3, 3))
        _, predictions, _ = fit_result

        npt.assert_array_less(predictions, 1)
        npt.assert_array_less(0, predictions)
Exemple #16
0
    def test_logistic_perfect_separation(self):
        """A perfectly separable outcome must give all-NaN logistic predictions."""
        # Splitting x at its own mean makes the outcome a deterministic
        # function of the predictor.
        separable_outcome = self.df.x > self.df.x.mean()
        plotter = lm._RegressionPlotter(
            "x", separable_outcome, data=self.df, logistic=True, n_boot=10
        )

        # Silence floating-point warnings raised while fitting.
        with np.errstate(all="ignore"):
            _, predictions, _ = plotter.fit_regression(x_range=(-3, 3))

        assert np.isnan(predictions).all()
Exemple #17
0
    def test_regress_poly(self):
        """A first-order ``fit_poly`` must agree with a statsmodels OLS fit."""
        plotter = lm._RegressionPlotter(
            "x", "y", data=self.df, n_boot=self.n_boot
        )

        # First-order polynomial fit.
        poly_predictions, _ = plotter.fit_poly(self.grid, 1)

        # Reference fit through the statsmodels path, using an OLS model.
        ols_predictions, _ = plotter.fit_statsmodels(self.grid, smlm.OLS)

        # Both prediction vectors should match to default decimal precision.
        npt.assert_array_almost_equal(poly_predictions, ols_predictions)
Exemple #18
0
    def test_fast_regression(self):
        """``fit_fast`` must agree with a statsmodels OLS fit."""
        plotter = lm._RegressionPlotter(
            "x", "y", data=self.df, n_boot=self.n_boot
        )

        # The "fast" path, which just does linear algebra.
        fast_predictions, _ = plotter.fit_fast(self.grid)

        # Reference fit through the statsmodels path, using an OLS model.
        ols_predictions, _ = plotter.fit_statsmodels(self.grid, smlm.OLS)

        # Both prediction vectors should match to default decimal precision.
        npt.assert_array_almost_equal(fast_predictions, ols_predictions)
Exemple #19
0
    def test_scatter_data(self):
        """Check ``scatter_data`` with and without jitter.

        Without jitter the scatter data must equal the inputs. With
        ``x_jitter`` or ``y_jitter`` the jittered variable must change, but
        by less than the configured jitter width, while the other variable
        stays untouched.
        """
        # No jitter, continuous predictor: data passes through unchanged.
        p = lm._RegressionPlotter(self.df.x, self.df.y)
        x, y = p.scatter_data
        npt.assert_array_equal(x, self.df.x)
        npt.assert_array_equal(y, self.df.y)

        # No jitter, discrete predictor: data passes through unchanged.
        p = lm._RegressionPlotter(self.df.d, self.df.y)
        x, y = p.scatter_data
        npt.assert_array_equal(x, self.df.d)
        npt.assert_array_equal(y, self.df.y)

        # Jitter on x only.
        x_jitter = .1
        p = lm._RegressionPlotter(self.df.d, self.df.y, x_jitter=x_jitter)
        x, y = p.scatter_data
        assert (x != self.df.d).any()
        npt.assert_array_less(np.abs(self.df.d - x),
                              np.repeat(x_jitter, len(x)))
        npt.assert_array_equal(y, self.df.y)

        # Jitter on y only. The bound is the configured jitter width, not a
        # looser constant, so an oversized jitter is caught.
        y_jitter = .05
        p = lm._RegressionPlotter(self.df.d, self.df.y, y_jitter=y_jitter)
        x, y = p.scatter_data
        npt.assert_array_equal(x, self.df.d)
        assert (y != self.df.y).any()
        npt.assert_array_less(np.abs(self.df.y - y),
                              np.repeat(y_jitter, len(y)))
Exemple #20
0
    def test_partial(self):
        """Partialling out a confound must lower the x/y correlation.

        ``y`` and ``z`` are both built from ``x`` plus noise. Removing ``x``
        from one or both variables must give a smaller correlation than the
        raw one, for array and for Series inputs alike.
        """

        def correlation(plotter):
            # Off-diagonal entry of the 2x2 correlation matrix.
            return np.corrcoef(plotter.x, plotter.y)[0, 1]

        # Draw order matters: x, then the noise for y, then the noise for z.
        x = self.rs.randn(100)
        y = x + self.rs.randn(100)
        z = x + self.rs.randn(100)

        r_raw = correlation(lm._RegressionPlotter(y, z))

        r_semipartial = correlation(lm._RegressionPlotter(y, z, y_partial=x))
        assert r_semipartial < r_raw

        r_partial = correlation(
            lm._RegressionPlotter(y, z, x_partial=x, y_partial=x)
        )
        assert r_partial < r_raw

        # Repeat with Series for the confound and the first variable.
        x = pd.Series(x)
        y = pd.Series(y)
        r_partial = correlation(
            lm._RegressionPlotter(y, z, x_partial=x, y_partial=x)
        )
        assert r_partial < r_raw
Exemple #21
0
    def test_regress_logx(self):
        """Compare the linear and log-x fits at three points along the grid.

        The linear prediction must be larger at both ends of the grid and
        the log-x prediction larger at index 20.
        """
        x = np.arange(1, 10)
        y = np.arange(1, 10)
        eval_grid = np.linspace(1, 10, 100)
        plotter = lm._RegressionPlotter(x, y, n_boot=self.n_boot)

        linear_fit, _ = plotter.fit_fast(eval_grid)
        log_fit, _ = plotter.fit_logx(eval_grid)

        assert linear_fit[0] > log_fit[0]
        assert log_fit[20] > linear_fit[20]
        assert linear_fit[90] > log_fit[90]
Exemple #22
0
    def test_regress_n_boot(self):
        """Every fit method must return bootstraps shaped ``(n_boot, grid size)``."""
        plotter = lm._RegressionPlotter(
            "x", "y", data=self.df, n_boot=self.n_boot
        )
        expected_shape = (self.n_boot, self.grid.size)

        # Ordered from fastest to slowest: linear algebra, np.polyfit,
        # statsmodels.
        fit_calls = (
            lambda: plotter.fit_fast(self.grid),
            lambda: plotter.fit_poly(self.grid, 1),
            lambda: plotter.fit_statsmodels(self.grid, smlm.OLS),
        )

        for run_fit in fit_calls:
            _, boots = run_fit()
            npt.assert_equal(boots.shape, expected_shape)
Exemple #23
0
    def test_regress_without_bootstrap(self):
        """With ``ci=None`` every fit method must return ``None`` for the bootstraps."""
        plotter = lm._RegressionPlotter(
            "x", "y", data=self.df, n_boot=self.n_boot, ci=None
        )

        # Ordered from fastest to slowest: linear algebra, np.polyfit,
        # statsmodels.
        fit_calls = (
            lambda: plotter.fit_fast(self.grid),
            lambda: plotter.fit_poly(self.grid, 1),
            lambda: plotter.fit_statsmodels(self.grid, smlm.OLS),
        )

        for run_fit in fit_calls:
            _, boots = run_fit()
            assert boots is None
Exemple #24
0
 def test_singleton(self, x, y):
     """The plotter built from the supplied ``x`` and ``y`` must have ``fit_reg`` falsy."""
     # NOTE(review): x and y are presumably injected by a parametrize
     # decorator that is not visible here - confirm.
     plotter = lm._RegressionPlotter(x, y)
     fit_requested = plotter.fit_reg
     assert not fit_requested
Exemple #25
0
    def test_provided_bins(self):
        """Binning with explicit bins must assign exactly those bin values."""
        plotter = lm._RegressionPlotter(self.df.x, self.df.y)
        binned_x, _ = plotter.bin_predictor(self.bins_given)

        assigned_values = np.unique(binned_x)
        npt.assert_array_equal(assigned_values, self.bins_given)
Exemple #26
0
    def test_bin_results(self):
        """Observations in a higher bin must all exceed those in the bin below."""
        plotter = lm._RegressionPlotter(self.df.x, self.df.y)
        binned_x, _ = plotter.bin_predictor(self.bins_given)

        # Compare adjacent bins: -1 against 0, then 0 against 1.
        for lower_bin, upper_bin in ((-1, 0), (0, 1)):
            smallest_in_upper = self.df.x[binned_x == upper_bin].min()
            largest_in_lower = self.df.x[binned_x == lower_bin].max()
            assert smallest_in_upper > largest_in_lower
Exemple #27
0
    def test_numeric_bins(self):
        """An integer bin count must yield that many bins, all of them used."""
        plotter = lm._RegressionPlotter(self.df.x, self.df.y)
        binned_x, bin_values = plotter.bin_predictor(self.bins_numeric)

        npt.assert_equal(len(bin_values), self.bins_numeric)
        assigned_values = np.unique(binned_x)
        npt.assert_array_equal(assigned_values, bin_values)
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from bokeh.plotting import figure, show, output_file
from bokeh.models import HoverTool
from seaborn.regression import _RegressionPlotter

# Load the "tips" example dataset and sort it in place by total_bill.
# NOTE(review): the sort is presumably there so the line glyphs drawn from
# this frame run left to right - confirm.
tips = sns.load_dataset("tips")
tips.sort_values(by='total_bill', inplace=True)

# Fit the regression with seaborn's private plotter, evaluating it at the
# observed total_bill values so the results align row-for-row with `tips`.
# NOTE(review): _RegressionPlotter is a private seaborn API and may change
# between releases.
regplot = _RegressionPlotter('total_bill', 'tip', data=tips)
grid, yhat, err_bands = regplot.fit_regression(grid=tips.total_bill)

# Store the fit results as columns so Bokeh can read them via `source=tips`.
# NOTE(review): err_bands[0] / err_bands[1] are presumably the lower / upper
# confidence bounds - confirm against fit_regression.
tips['yhat'] = yhat
tips['ci1'] = err_bands[0]
tips['ci2'] = err_bands[1]

# Hover tooltip showing the cursor's data-space coordinates.
hover = HoverTool(tooltips=[
    ("(x, y)", "($x, $y)"),
])
tools = [hover, 'pan', 'wheel_zoom']

p = figure(title="Bokeh Regplot", toolbar_location='right', tools=tools)

# Raw observations, then the fitted regression line on top.
p.scatter('total_bill', 'tip', source=tips)
p.line('total_bill', 'yhat', source=tips, line_width=2, line_color='grey')
p.line('total_bill',
       'ci1',
       source=tips,
       alpha=0.7,