def test_bootstrap_seed(random):
    """Check that a fixed seed makes the bootstrap resamples repeatable."""
    sample = np.random.randn(50)
    fixed_seed = 42

    # Two independent calls with the same seed must agree element-wise.
    first_run = algo.bootstrap(sample, seed=fixed_seed)
    second_run = algo.bootstrap(sample, seed=fixed_seed)

    assert_array_equal(first_run, second_run)
def test_bootstrap(random):
    """Check the bootstrap on constant data, where the answer is known."""
    n_boot = 5
    constant = np.ones(10)
    expected = np.ones(n_boot)

    # Default statistic on an all-ones sample.
    default_boots = algo.bootstrap(constant, n_boot=n_boot)
    assert_array_equal(default_boots, expected)

    # Same sample, with the median requested explicitly.
    median_boots = algo.bootstrap(constant, n_boot=n_boot, func=np.median)
    assert_array_equal(median_boots, expected)
def test_bootstrap_length(random):
    """Check the number of replicates returned, by default and on request."""
    sample = np.random.randn(1000)

    # Without n_boot the test expects 10000 replicates.
    default_boots = algo.bootstrap(sample)
    assert len(default_boots) == 10000

    requested = 100
    custom_boots = algo.bootstrap(sample, n_boot=requested)
    assert len(custom_boots) == requested
def test_bootstrap_axis(random):
    """Test axis kwarg to bootstrap function."""
    x = np.random.randn(10, 20)
    n_boot = 100

    # Without an axis, each replicate is expected to be a single value.
    out_default = algo.bootstrap(x, n_boot=n_boot)
    assert out_default.shape == (n_boot,)

    # With axis=0, each replicate is expected to have one value per column.
    out_axis = algo.bootstrap(x, n_boot=n_boot, axis=0)
    # Compare with ``==``: ``assert shape, expected`` would use the tuple as
    # the failure message and pass for any non-empty shape.
    assert out_axis.shape == (n_boot, x.shape[1])
def test_bootstrap_reproducibility(random):
    """Check that equal seeds give equal resamples via both seed keywords."""
    sample = np.random.randn(50)

    first = algo.bootstrap(sample, seed=100)
    second = algo.bootstrap(sample, seed=100)
    assert_array_equal(first, second)

    with pytest.warns(UserWarning):
        # Deprecated keyword; drop this part when random_seed is removed.
        first = algo.bootstrap(sample, random_seed=100)
        second = algo.bootstrap(sample, random_seed=100)
        assert_array_equal(first, second)
def test_bootstrap_units(random):
    """Check that resampling by unit IDs widens the bootstrap distribution."""
    values = np.random.randn(50)
    unit_ids = np.repeat(range(10), 5)

    # One random offset per unit, expanded to one offset per observation.
    unit_offsets = np.random.normal(0, 2, 10)[unit_ids]
    values_rm = values + unit_offsets

    seed = 77
    boots_orig = algo.bootstrap(values_rm, seed=seed)
    boots_rm = algo.bootstrap(values_rm, units=unit_ids, seed=seed)

    assert boots_orig.std() < boots_rm.std()
def test_bootstrap_range(random):
    """Check that bootstrap values never leave the range of the sample."""
    sample = np.random.randn(1000)
    lower, upper = sample.min(), sample.max()

    boots = algo.bootstrap(sample)

    assert boots.min() >= lower
    assert boots.max() <= upper
def bootstrapped_ci(x, func, n_boot, which_ci=95, axis=None):
    """
    Get the confidence interval (CI) of a metric using bootstrapping.

    Parameters
    ----------
    x : array-like
        A sample.
    func : callable (function object)
        The function that estimates the metric (for example np.mean,
        np.median, ...).
    n_boot : int
        Number of resamples to use for the bootstrap estimate.
    which_ci : float, optional
        A number between 0 and 100 that defines the confidence interval.
        The default is 95.
    axis : int or None, optional
        Forwarded as ``axis`` both to the bootstrap call and to the
        interval computation. The default is None.

    Returns
    -------
    The value returned by ``seaborn.utils.ci`` for the bootstrap
    distribution (presumably the lower and upper bounds of the interval;
    confirm against the installed seaborn version).
    """
    # Imported lazily so seaborn is only needed when this helper is called.
    from seaborn import algorithms as sns_algorithms
    from seaborn import utils as sns_utils

    resampled = sns_algorithms.bootstrap(
        x, func=func, n_boot=n_boot, axis=axis)
    return sns_utils.ci(resampled, which=which_ci, axis=axis)
def test_nanaware_func_warning(random):
    """Check that a NaN in the data triggers a warning for func="ptp"."""
    sample = np.random.normal(size=10)
    sample[0] = np.nan

    with pytest.warns(UserWarning, match="Data contain nans but"):
        boots = algo.bootstrap(sample, func="ptp")

    # The NaN is expected to show up in at least one replicate.
    assert np.isnan(boots).any()
def bootstrapped_cis(vals):
    """Return a "low"/"high" Series with the bootstrap interval for ``vals``.

    ``func``, ``n_boot``, ``seed``, ``ci`` and ``null_ci`` are taken from the
    enclosing scope. With at most one value no interval is computed and
    ``null_ci`` is returned instead.
    """
    if len(vals) > 1:
        resampled = bootstrap(vals, func=func, n_boot=n_boot, seed=seed)
        bounds = utils.ci(resampled, ci)
        return pd.Series(bounds, ["low", "high"])
    return null_ci
def test_bootstrap_multiarg(random):
    """Check that bootstrap accepts several input arrays at once."""
    # Ten identical rows each, so every resample gives the same answer.
    x = np.vstack([[1, 10]] * 10)
    y = np.vstack([[5, 5]] * 10)

    def f(x, y):
        stacked = np.vstack((x, y))
        return stacked.max(axis=0)

    out_actual = algo.bootstrap(x, y, n_boot=2, func=f)
    out_wanted = np.array([[5, 10]] * 2)

    assert_array_equal(out_actual, out_wanted)
def test_bootstrap_ols(random):
    """Check bootstrapping of an ordinary least squares fit."""

    def ols_fit(X, y):
        # Normal-equation solution: (X'X)^-1 X'y
        gram_inverse = np.linalg.inv(np.dot(X.T, X))
        return gram_inverse.dot(X.T).dot(y)

    X = np.column_stack((np.random.randn(50, 4), np.ones(50)))
    w = [2, 4, 0, 3, 5]
    signal = np.dot(X, w)

    # Draw the heavy noise first, then the light noise, as separate samples.
    y_noisy = signal + np.random.randn(50) * 20
    y_lownoise = signal + np.random.randn(50)

    n_boot = 500
    w_boot_noisy = algo.bootstrap(X, y_noisy, n_boot=n_boot, func=ols_fit)
    w_boot_lownoise = algo.bootstrap(
        X, y_lownoise, n_boot=n_boot, func=ols_fit)

    expected_shape = (n_boot, 5)
    assert w_boot_noisy.shape == expected_shape
    assert w_boot_lownoise.shape == expected_shape
    assert w_boot_noisy.std() > w_boot_lownoise.std()
def get_params(self, grid):
    """Fit a straight line to ``self.x``/``self.y``. Adapted from seaborn.

    Returns ``(yhat, None)`` when ``self.ci`` is None, where ``yhat`` is the
    fitted line evaluated on ``grid``. Otherwise returns the fitted
    coefficients together with the transposed bootstrap distribution of the
    coefficients.
    """

    def _least_squares(design, target):
        return np.linalg.pinv(design).dot(target)

    # Prepend a column of ones so the first coefficient is the intercept.
    design = np.c_[np.ones(len(self.x)), self.x]
    response = self.y
    grid = np.c_[np.ones(len(grid)), grid]

    beta_plot = _least_squares(design, response)
    yhat = grid.dot(beta_plot)
    if self.ci is None:
        return yhat, None

    beta_boots = sns_algos.bootstrap(
        design,
        response,
        func=_least_squares,
        n_boot=self.n_boot,
        units=self.units,  # pytype: disable=attribute-error
        seed=self.seed)
    return beta_plot, beta_boots.T
def test_bootstrap_string_func():
    """Check that a numpy function name behaves like the function itself."""
    x = np.random.randn(100)

    for name, numpy_func in (("mean", np.mean), ("std", np.std)):
        res_a = algo.bootstrap(x, func=name, seed=0)
        res_b = algo.bootstrap(x, func=numpy_func, seed=0)
        assert np.array_equal(res_a, res_b)

    # A name that is not a numpy attribute must be rejected.
    with pytest.raises(AttributeError):
        algo.bootstrap(x, func="not_a_method_name")
def fit_logx(self, grid):
    """Fit a line in log-log space and predict on ``grid``.

    Stores the fitted coefficients on ``self.betas``. Returns the
    exponentiated prediction and, unless ``self.ci`` is None, the
    exponentiated bootstrap predictions (otherwise None).
    """

    def _log_log_fit(_x, _y):
        # Column 0 is the intercept column; only column 1 is log-transformed.
        log_design = np.c_[_x[:, 0], np.log(_x[:, 1])]
        return np.linalg.pinv(log_design).dot(np.log(_y))

    design = np.c_[np.ones(len(self.x)), self.x]
    response = self.y
    grid = np.c_[np.ones(len(grid)), np.log(grid)]

    self.betas = _log_log_fit(design, response)
    yhat = grid.dot(self.betas)
    if self.ci is None:
        return np.exp(yhat), None

    beta_boots = algo.bootstrap(
        design,
        response,
        func=_log_log_fit,
        n_boot=self.n_boot,
        units=self.units).T
    yhat_boots = grid.dot(beta_boots).T
    return np.exp(yhat), np.exp(yhat_boots)
def test_nanaware_func_auto(random):
    """Check that func="mean" yields no NaN replicates for data with a NaN."""
    sample = np.random.normal(size=10)
    sample[0] = np.nan

    boots = algo.bootstrap(sample, func="mean")

    assert not np.isnan(boots).any()
def test_bootstrap_arglength():
    """Check that input arrays of unequal length raise ValueError."""
    short, long_ = np.arange(5), np.arange(10)
    with pytest.raises(ValueError):
        algo.bootstrap(short, long_)
def _as_degrees(value, default):
    """Return ``value`` in degrees, or ``default`` degrees when it is None."""
    if value is None:
        return default * u.deg
    if hasattr(value, "unit"):
        return value
    return value * u.deg


def _rlm_estimator(YY, XX):
    """Build a function of row indices that fits a Huber-T robust line."""

    def estimate(inds):
        # add_constant puts the intercept first, so params is
        # (intercept, slope).
        design = sm.add_constant(XX[inds])
        model = sm.RLM(YY[inds], design, M=sm.robust.norms.HuberT())
        return model.fit().params

    return estimate


def _slope_conf_band(boot_res, X):
    """Return the 5th/95th percentile of the bootstrapped lines at ``X``."""
    lines = np.vstack([[res[0] + res[1] * X] for res in boot_res])
    return np.percentile(lines, (5, 95), axis=0)


def _start_scale_height_figure(data, mask, xx, yy, zz, min_lat, xlim, ylim):
    """Create the figure parts shared by both fit modes; return both axes."""
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax2 = ax.twiny()
    ax.scatter(xx, yy, color="k", alpha=0.8)
    if zz is not None:
        ax.scatter(xx, zz, color="grey", alpha=0.8)
    ax.set_xlabel(r"$\tan$(b)", fontsize=12)
    ax.set_ylabel(r"$\log$($H\alpha$ Intensity / R)", fontsize=12)
    longitudes = data["GAL-LON"][mask]
    ax.set_title(r"${0:.1f} < l < {1:.1f}$".format(longitudes.min(),
                                                    longitudes.max()),
                 fontsize=14)
    ax2.plot(np.degrees(np.arctan(xlim)),
             np.log([0.1, 0.1]),
             ls=":",
             lw=1,
             color="k",
             label="0.1 R")
    ax2.fill_between([-min_lat, min_lat] * u.deg, [ylim[0], ylim[0]],
                     [ylim[1], ylim[1]],
                     color=pal[1],
                     alpha=0.1,
                     label=r"$|b| < 5\degree$")
    return ax, ax2


def _finish_scale_height_figure(ax, ax2, xlim, ylim, figure_name):
    """Apply limits and legends, save the current figure and close it."""
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    ax2.set_xlabel(r"$b$ (deg)", fontsize=12)
    ax2.set_xlim(np.degrees(np.arctan(xlim)))
    ax.legend(fontsize=12, loc=1)
    ax2.legend(fontsize=12, loc=2)
    plt.tight_layout()
    plt.savefig(figure_name, dpi=300)
    plt.close()


def fit_scale_heights(data,
                      masks,
                      min_lat=None,
                      max_lat=None,
                      deredden=False,
                      fig_names=None,
                      return_smoothed=False,
                      smoothed_width=None,
                      xlim=None,
                      ylim=None,
                      robust=True,
                      n_boot=10000):
    """
    Fits scale height data and returns slopes

    For every mask, a straight line is fitted to log(intensity) against
    tan(b) separately for the positive and the negative side, using only
    points with ``tan(min_lat) < |tan(b)| < tan(max_lat)`` and a finite
    log-intensity.

    Parameters
    ----------
    data: `skySurvey`
        WHAM skySurvey object of full sky (requires track keyword), or spiral
        arm section. The columns "tan(b)", "INTEN" and "GAL-LON" are read,
        plus "INTEN_DERED" and "DISTANCE" when ``deredden`` is True.
    masks: `list like`
        longitude masks to use
    min_lat: `u.Quantity`
        min latitude to fit; plain numbers are taken as degrees, default 5 deg
    max_lat: `u.Quantity`
        max latitude to fit; plain numbers are taken as degrees, default
        35 deg
    deredden: `bool`
        if True, also fits dereddened slopes
    fig_names: `str`
        if provided, saves one figure per fitted mask as
        "<fig_names>_<mask index>.png"
    return_smoothed: `bool`
        if True, returns smoothed longitude and slope estimates
    smoothed_width: `u.Quantity`
        width to smooth data to in longitude; plain numbers are taken as
        degrees, default 5 deg
    xlim, ylim: array-like
        axis limits of the figures, defaults (-0.9, 0.9) and (-4.6, 3.2)
    robust: `bool`
        if True, fits a statsmodels robust linear model (Huber-T) and
        bootstraps it; otherwise uses scipy.stats.siegelslopes, falling back
        to scipy.stats.theilslopes when siegelslopes is unavailable
    n_boot: `int`
        only if robust = True
        number of bootstrap resamples

    Returns
    -------
    results: `dict`
        One array entry per quantity, with one element per mask:
        "median_longitude", "slopes_pos", "slopes_neg", "intercept_pos",
        "intercept_neg"; the matching "*_err" entries when ``robust``;
        "median_distance" and the "*_dr" entries when ``deredden``; and the
        "smoothed_*" entries when ``return_smoothed``.
    """
    min_lat = _as_degrees(min_lat, 5)
    max_lat = _as_degrees(max_lat, 35)
    smoothed_width = _as_degrees(smoothed_width, 5)

    # One list per fitted quantity; each gets one entry per mask.
    slopes_pos, slopes_neg = [], []
    slopes_pos_dr, slopes_neg_dr = [], []
    intercept_pos, intercept_neg = [], []
    intercept_pos_dr, intercept_neg_dr = [], []
    slopes_pos_err, slopes_neg_err = [], []
    slopes_pos_dr_err, slopes_neg_dr_err = [], []
    intercept_pos_err, intercept_neg_err = [], []
    intercept_pos_dr_err, intercept_neg_dr_err = [], []
    median_longitude = []
    median_distance = []

    # Loop-invariant set-up.
    y_min = np.tan(min_lat)
    y_max = np.tan(max_lat)

    slope_estimator = None
    if not robust:
        if hasattr(stats, "siegelslopes"):
            slope_estimator = stats.siegelslopes
        else:
            logging.warning(
                "Installed version of scipy does not have the siegelslopes method in scipy.stats!"
            )
            slope_estimator = stats.theilslopes

    if fig_names is not None:
        if xlim is None:
            xlim = np.array([-0.9, 0.9])
        if ylim is None:
            ylim = np.array([-4.6, 3.2])

    for ell2, mask in enumerate(masks):
        xx = data["tan(b)"][mask]
        yy = np.log(data["INTEN"][mask])
        nan_mask = np.isnan(yy) | np.isinf(yy)
        zz = None
        if deredden:
            zz = np.log(data["INTEN_DERED"][mask])
            nan_mask_z = np.isnan(zz) | np.isinf(zz)

        median_longitude.append(np.median(data["GAL-LON"][mask]))
        if deredden:
            median_distance.append(np.median(data["DISTANCE"][mask]))

        # Latitude windows above and below the plane.
        pos_side = (xx > y_min) & (xx < y_max)
        neg_side = (xx < -y_min) & (xx > -y_max)
        keep_pos = pos_side & ~nan_mask
        keep_neg = neg_side & ~nan_mask
        if deredden:
            keep_pos_dr = pos_side & ~nan_mask_z
            keep_neg_dr = neg_side & ~nan_mask_z

        if not robust:
            siegel_result_pos = slope_estimator(yy[keep_pos], xx[keep_pos])
            siegel_result_neg = slope_estimator(yy[keep_neg], xx[keep_neg])
            if deredden:
                siegel_result_pos_dr = slope_estimator(
                    zz[keep_pos_dr], xx[keep_pos_dr])
                siegel_result_neg_dr = slope_estimator(
                    zz[keep_neg_dr], xx[keep_neg_dr])

            slopes_pos.append(siegel_result_pos[0])
            slopes_neg.append(siegel_result_neg[0])
            intercept_pos.append(siegel_result_pos[1])
            intercept_neg.append(siegel_result_neg[1])
            if deredden:
                slopes_pos_dr.append(siegel_result_pos_dr[0])
                slopes_neg_dr.append(siegel_result_neg_dr[0])
                intercept_pos_dr.append(siegel_result_pos_dr[1])
                intercept_neg_dr.append(siegel_result_neg_dr[1])

            if fig_names is not None:
                figure_name = "{0}_{1}.png".format(fig_names, ell2)
                ax, ax2 = _start_scale_height_figure(
                    data, mask, xx, yy, zz, min_lat, xlim, ylim)

                line_xx = np.linspace(y_min, y_max, 10)
                line_yy_pos = (siegel_result_pos[0] * line_xx +
                               siegel_result_pos[1])
                line_yy_neg = (siegel_result_neg[0] * -line_xx +
                               siegel_result_neg[1])
                ax.plot(line_xx,
                        line_yy_pos,
                        color="r",
                        lw=3,
                        alpha=0.9,
                        label=r"$H_{{n_e^2}} = {0:.2f} D$".format(
                            1 / -siegel_result_pos[0]))
                ax.plot(-line_xx,
                        line_yy_neg,
                        color="b",
                        lw=3,
                        alpha=0.9,
                        label=r"$H_{{n_e^2}} = {0:.2f} D$".format(
                            1 / siegel_result_neg[0]))
                if deredden:
                    line_yy_pos_dr = (siegel_result_pos_dr[0] * line_xx +
                                      siegel_result_pos_dr[1])
                    line_yy_neg_dr = (siegel_result_neg_dr[0] * -line_xx +
                                      siegel_result_neg_dr[1])
                    ax.plot(line_xx,
                            line_yy_pos_dr,
                            color="r",
                            lw=3,
                            alpha=0.9,
                            ls="--",
                            label=r"Dered: $H_{{n_e^2}} = {0:.2f} D$".format(
                                1 / -siegel_result_pos_dr[0]))
                    ax.plot(-line_xx,
                            line_yy_neg_dr,
                            color="b",
                            lw=3,
                            alpha=0.9,
                            ls="--",
                            label=r"Dered: $H_{{n_e^2}} = {0:.2f} D$".format(
                                1 / siegel_result_neg_dr[0]))

                _finish_scale_height_figure(ax, ax2, xlim, ylim, figure_name)
            continue

        # Robust mode: bootstrap a Huber-T robust linear fit.
        yy_pos, xx_pos = yy[keep_pos], xx[keep_pos]
        yy_neg, xx_neg = yy[keep_neg], xx[keep_neg]

        if len(yy_pos) < 5 or len(yy_neg) < 5:
            # Too few points for a fit. Record NaN for every quantity so all
            # result arrays stay aligned with median_longitude; no figure is
            # drawn for this mask. (Previously this branch read the
            # bootstrap of an earlier mask, which raised NameError on the
            # first mask and left the dereddened lists short.)
            for series in (slopes_pos, slopes_neg, slopes_pos_err,
                           slopes_neg_err, intercept_pos, intercept_neg,
                           intercept_pos_err, intercept_neg_err):
                series.append(np.nan)
            if deredden:
                for series in (slopes_pos_dr, slopes_neg_dr,
                               slopes_pos_dr_err, slopes_neg_dr_err,
                               intercept_pos_dr, intercept_neg_dr,
                               intercept_pos_dr_err, intercept_neg_dr_err):
                    series.append(np.nan)
            continue

        # The bootstrap resamples row indices; the estimator looks the rows
        # up in the selected data.
        boot_pos = bootstrap(np.arange(len(yy_pos)),
                             func=_rlm_estimator(yy_pos, xx_pos),
                             n_boot=n_boot)
        boot_neg = bootstrap(np.arange(len(yy_neg)),
                             func=_rlm_estimator(yy_neg, xx_neg),
                             n_boot=n_boot)

        # Column 1 holds the slope, column 0 the intercept.
        slopes_pos.append(np.mean(boot_pos[:, 1], axis=0))
        slopes_neg.append(np.mean(boot_neg[:, 1], axis=0))
        slopes_pos_err.append(np.std(boot_pos[:, 1], axis=0))
        slopes_neg_err.append(np.std(boot_neg[:, 1], axis=0))
        intercept_pos.append(np.mean(boot_pos[:, 0], axis=0))
        intercept_neg.append(np.mean(boot_neg[:, 0], axis=0))
        intercept_pos_err.append(np.std(boot_pos[:, 0], axis=0))
        intercept_neg_err.append(np.std(boot_neg[:, 0], axis=0))

        if deredden:
            zz_dr_pos, xx_dr_pos = zz[keep_pos_dr], xx[keep_pos_dr]
            zz_dr_neg, xx_dr_neg = zz[keep_neg_dr], xx[keep_neg_dr]
            boot_pos_dr = bootstrap(np.arange(len(zz_dr_pos)),
                                    func=_rlm_estimator(zz_dr_pos, xx_dr_pos),
                                    n_boot=n_boot)
            boot_neg_dr = bootstrap(np.arange(len(zz_dr_neg)),
                                    func=_rlm_estimator(zz_dr_neg, xx_dr_neg),
                                    n_boot=n_boot)
            slopes_pos_dr.append(np.mean(boot_pos_dr[:, 1], axis=0))
            slopes_neg_dr.append(np.mean(boot_neg_dr[:, 1], axis=0))
            slopes_pos_dr_err.append(np.std(boot_pos_dr[:, 1], axis=0))
            slopes_neg_dr_err.append(np.std(boot_neg_dr[:, 1], axis=0))
            intercept_pos_dr.append(np.mean(boot_pos_dr[:, 0], axis=0))
            intercept_neg_dr.append(np.mean(boot_neg_dr[:, 0], axis=0))
            intercept_pos_dr_err.append(np.std(boot_pos_dr[:, 0], axis=0))
            intercept_neg_dr_err.append(np.std(boot_neg_dr[:, 0], axis=0))

        if fig_names is not None:
            figure_name = "{0}_{1}.png".format(fig_names, ell2)
            ax, ax2 = _start_scale_height_figure(data, mask, xx, yy, zz,
                                                 min_lat, xlim, ylim)

            line_xx = np.linspace(y_min, y_max, 100)
            line_yy_pos = slopes_pos[-1] * line_xx + intercept_pos[-1]
            line_yy_neg = slopes_neg[-1] * -line_xx + intercept_neg[-1]
            line_yy_pos_range = _slope_conf_band(boot_pos, line_xx)
            line_yy_neg_range = _slope_conf_band(boot_neg, -line_xx)

            ax.plot(line_xx,
                    line_yy_pos,
                    color="r",
                    lw=3,
                    alpha=0.9,
                    label=r"$H_{{n_e^2}} = ({0:.2f} \pm {1:.2f}) D$".format(
                        1 / -slopes_pos[-1],
                        np.abs(1 / slopes_pos[-1] * slopes_pos_err[-1] /
                               slopes_pos[-1])))
            ax.fill_between(line_xx,
                            line_yy_pos_range[0],
                            line_yy_pos_range[1],
                            color="r",
                            alpha=0.2)
            # The uncertainty of the negative side comes from the negative
            # fit (the positive-side values were used here before).
            ax.plot(-line_xx,
                    line_yy_neg,
                    color="b",
                    lw=3,
                    alpha=0.9,
                    label=r"$H_{{n_e^2}} = ({0:.2f} \pm {1:.2f}) D$".format(
                        1 / slopes_neg[-1],
                        np.abs(-1 / slopes_neg[-1] * slopes_neg_err[-1] /
                               slopes_neg[-1])))
            ax.fill_between(-line_xx,
                            line_yy_neg_range[0],
                            line_yy_neg_range[1],
                            color="b",
                            alpha=0.2)

            if deredden:
                line_yy_pos_dr = (slopes_pos_dr[-1] * line_xx +
                                  intercept_pos_dr[-1])
                line_yy_neg_dr = (slopes_neg_dr[-1] * -line_xx +
                                  intercept_neg_dr[-1])
                line_yy_pos_range_dr = _slope_conf_band(boot_pos_dr, line_xx)
                line_yy_neg_range_dr = _slope_conf_band(boot_neg_dr, -line_xx)
                ax.plot(
                    line_xx,
                    line_yy_pos_dr,
                    color="r",
                    lw=3,
                    alpha=0.9,
                    ls="--",
                    label=r"Dered: $H_{{n_e^2}} = ({0:.2f} \pm {1:.2f}) D$".
                    format(
                        1 / -slopes_pos_dr[-1],
                        np.abs(1 / slopes_pos_dr[-1] *
                               slopes_pos_dr_err[-1] / slopes_pos_dr[-1])))
                ax.fill_between(line_xx,
                                line_yy_pos_range_dr[0],
                                line_yy_pos_range_dr[1],
                                color="r",
                                alpha=0.2)
                ax.plot(
                    -line_xx,
                    line_yy_neg_dr,
                    color="b",
                    lw=3,
                    alpha=0.9,
                    ls="--",
                    label=r"Dered: $H_{{n_e^2}} = ({0:.2f} \pm {1:.2f}) D$".
                    format(
                        1 / slopes_neg_dr[-1],
                        np.abs(-1 / slopes_neg_dr[-1] *
                               slopes_neg_dr_err[-1] / slopes_neg_dr[-1])))
                ax.fill_between(-line_xx,
                                line_yy_neg_range_dr[0],
                                line_yy_neg_range_dr[1],
                                color="b",
                                alpha=0.2)

            _finish_scale_height_figure(ax, ax2, xlim, ylim, figure_name)

    # Assemble the results once, after all masks have been processed.
    results = {
        "median_longitude": np.array(median_longitude),
        "slopes_pos": np.array(slopes_pos),
        "slopes_neg": np.array(slopes_neg),
        "intercept_pos": np.array(intercept_pos),
        "intercept_neg": np.array(intercept_neg),
    }
    if robust:
        results["slopes_pos_err"] = np.array(slopes_pos_err)
        results["slopes_neg_err"] = np.array(slopes_neg_err)
        results["intercept_pos_err"] = np.array(intercept_pos_err)
        results["intercept_neg_err"] = np.array(intercept_neg_err)
    if deredden:
        # No trailing comma here: it used to wrap the array in a 1-tuple.
        results["median_distance"] = np.array(median_distance)
        results["slopes_pos_dr"] = np.array(slopes_pos_dr)
        results["slopes_neg_dr"] = np.array(slopes_neg_dr)
        results["intercept_pos_dr"] = np.array(intercept_pos_dr)
        results["intercept_neg_dr"] = np.array(intercept_neg_dr)
        if robust:
            results["slopes_pos_dr_err"] = np.array(slopes_pos_dr_err)
            results["slopes_neg_dr_err"] = np.array(slopes_neg_dr_err)
            results["intercept_pos_dr_err"] = np.array(intercept_pos_dr_err)
            results["intercept_neg_dr_err"] = np.array(intercept_neg_dr_err)

    if return_smoothed:
        results["smoothed_longitude"] = np.arange(np.min(median_longitude),
                                                  np.max(median_longitude),
                                                  0.25)
        if deredden:
            distance_interp = interp1d(median_longitude, median_distance)
            results["smoothed_distance"] = distance_interp(
                results["smoothed_longitude"])

        n_smoothed = len(results["smoothed_longitude"])
        smoothed_slope_pos_ha = np.zeros((3, n_smoothed))
        smoothed_slope_neg_ha = np.zeros((3, n_smoothed))
        smoothed_slope_pos_ha_dr = np.zeros((3, n_smoothed))
        smoothed_slope_neg_ha_dr = np.zeros((3, n_smoothed))

        lon_centres = results["median_longitude"]
        half_width = smoothed_width.value / 2
        percentiles = (10, 50, 90)
        for ell, lon in enumerate(results["smoothed_longitude"]):
            # Masks whose median longitude falls in the window around lon.
            window = ((lon_centres <= lon + half_width) &
                      (lon_centres > lon - half_width))
            smoothed_slope_pos_ha[:, ell] = np.nanpercentile(
                results["slopes_pos"][window], percentiles)
            smoothed_slope_neg_ha[:, ell] = np.nanpercentile(
                results["slopes_neg"][window], percentiles)
            if deredden:
                smoothed_slope_pos_ha_dr[:, ell] = np.nanpercentile(
                    results["slopes_pos_dr"][window], percentiles)
                smoothed_slope_neg_ha_dr[:, ell] = np.nanpercentile(
                    results["slopes_neg_dr"][window], percentiles)

        results["smoothed_slopes_pos"] = smoothed_slope_pos_ha
        results["smoothed_slopes_neg"] = smoothed_slope_neg_ha
        if deredden:
            results["smoothed_slopes_pos_dr"] = smoothed_slope_pos_ha_dr
            results["smoothed_slopes_neg_dr"] = smoothed_slope_neg_ha_dr

    return results
def regplot(x,
            y,
            data=None,
            model=None,
            ci=95.,
            scatter_color=None,
            model_color='k',
            ax=None,
            scatter_kws=None,
            regplot_kws=None,
            cmap=None,
            cax=None,
            clabel=None,
            xlabel=False,
            ylabel=False,
            colorbar=False,
            **kwargs):
    """Scatter ``y`` against ``x`` with a fitted line and a bootstrap band.

    Parameters
    ----------
    x, y :
        The data themselves when ``data`` is None, otherwise keys into
        ``data``.
    data : optional
        Container indexed with ``x`` and ``y``.
    model : optional
        Model class called as ``model(y, X, **kwargs).fit()``; defaults to
        ``statsmodels.api.OLS``.
    ci : float, optional
        Size of the confidence band passed to ``seaborn.utils.ci``.
    scatter_color, model_color : optional
        Colours of the points and of the line/band.
    ax : optional
        Axes to draw on; a new figure is created when None.
    scatter_kws, regplot_kws : dict, optional
        Extra keyword arguments for ``ax.scatter`` and for the ``ax.plot``
        call drawing the line. None means no extra arguments.
    cmap : optional
        Accepted but not used by this function.
    cax, clabel, colorbar : optional
        When ``colorbar`` is true a colorbar is drawn for the scatter
        (optionally in ``cax``) and labelled with ``clabel`` if given.
    xlabel, ylabel : bool or str, optional
        A string is used as the axis label; any other true value labels the
        axis with ``x`` / ``y``.
    **kwargs :
        Forwarded to ``model``.

    Returns
    -------
    The fitted results object returned by ``model(...).fit()``.
    """
    if model is None:
        import statsmodels.api as sm
        model = sm.OLS
    from seaborn import utils
    from seaborn import algorithms as algo

    # None instead of ``{}`` defaults avoids sharing one dict across calls.
    scatter_kws = {} if scatter_kws is None else scatter_kws
    regplot_kws = {} if regplot_kws is None else regplot_kws

    if ax is None:
        _, ax = plt.subplots()

    if data is None:
        _x, _y = x, y
    else:
        _x, _y = data[x], data[y]

    # Design matrices with an intercept column for the data and the grid.
    grid = np.linspace(_x.min(), _x.max(), 100)
    X = np.c_[np.ones(len(_x)), _x]
    G = np.c_[np.ones(len(grid)), grid]

    results = model(_y, X, **kwargs).fit()

    def reg_func(xx, yy):
        # Refit on a resample and predict on the fixed grid.
        return model(yy, xx, **kwargs).fit().predict(G)

    yhat = results.predict(G)
    yhat_boots = algo.bootstrap(X, _y, func=reg_func, n_boot=1000, units=None)
    err_bands = utils.ci(yhat_boots, ci, axis=0)

    ax.plot(grid, yhat, color=model_color, **regplot_kws)
    sc = ax.scatter(_x, _y, c=scatter_color, **scatter_kws)
    ax.fill_between(grid, *err_bands, facecolor=model_color, alpha=.15)

    if colorbar:
        cb = plt.colorbar(mappable=sc, cax=cax, ax=ax)
        cb.ax.yaxis.set_ticks_position('right')
        if clabel:
            cb.set_label(clabel)

    if xlabel:
        ax.set_xlabel(xlabel if isinstance(xlabel, str) else x)
    if ylabel:
        ax.set_ylabel(ylabel if isinstance(ylabel, str) else y)

    return results
def estimate_statistic(self, estimator, ci, n_boot):
    """Compute the point estimate and interval for every plotted group.

    The results are stored as arrays on ``self.statistic`` and
    ``self.confint``. ``ci`` may be None (no interval), "sd" (estimate plus
    or minus one standard deviation), "range" (minimum and maximum) or a
    value handed to ``utils.ci`` together with ``n_boot`` bootstrap
    replicates.
    """

    def _point_estimate(values):
        # An empty group has no estimate.
        return estimator(values) if values.size else np.nan

    def _interval(values, units):
        # Fewer than two observations cannot produce an interval.
        if values.size < 2:
            return [np.nan, np.nan]
        if ci == "sd":
            centre = estimator(values)
            spread = np.std(values)
            return (centre - spread, centre + spread)
        if ci == "range":
            return (np.min(values), np.max(values))
        boots = bootstrap(values, func=estimator, n_boot=n_boot, units=units)
        return utils.ci(boots, ci)

    if self.hue_names is None:
        statistic, confint = [], []
    else:
        statistic = [[] for _ in self.plot_data]
        confint = [[] for _ in self.plot_data]

    for i, group_data in enumerate(self.plot_data):

        if self.plot_hues is None:
            # Single layer of grouping.
            if self.plot_units is None:
                stat_data = remove_na(group_data)
                unit_data = None
            else:
                unit_data = self.plot_units[i]
                have = pd.notnull(np.c_[group_data, unit_data]).all(axis=1)
                stat_data = group_data[have]
                unit_data = unit_data[have]

            statistic.append(_point_estimate(stat_data))
            if ci is not None:
                confint.append(_interval(stat_data, unit_data))
            continue

        # Grouping by a hue layer as well.
        for hue_level in self.hue_names:

            if not self.plot_hues[i].size:
                statistic[i].append(np.nan)
                if ci is not None:
                    confint[i].append((np.nan, np.nan))
                continue

            hue_mask = self.plot_hues[i] == hue_level

            if self.plot_units is None:
                stat_data = remove_na(group_data[hue_mask])
                unit_data = None
            else:
                group_units = self.plot_units[i]
                have = pd.notnull(np.c_[group_data, group_units]).all(axis=1)
                stat_data = group_data[hue_mask & have]
                unit_data = group_units[hue_mask & have]

            statistic[i].append(_point_estimate(stat_data))
            if ci is not None:
                confint[i].append(_interval(stat_data, unit_data))

    # Save the resulting values for plotting
    self.statistic = np.array(statistic)
    self.confint = np.array(confint)
# For every (axes, attribute, title) combination, bootstrap the mean and the
# median of that attribute within six subsets of ``df``.
# NOTE(review): this loop appears to continue past the visible source
# (``N``, ``tmp``, ``axis`` and ``title`` are not used in the part shown).
for axs, attributes, titles in zip(axzs, attributes_all, titles_all):
    for axis, attribute, title in zip(axs, attributes, titles):
        N = 6
        # The six subsets that are compared with each other.
        men = [
            df[df.hate == "hateful"], df[df.hate == "normal"],
            df[df.hate_neigh], df[df.normal_neigh], df[df.is_63_2 == True],
            df[df.is_63_2 == False]
        ]
        tmp = []
        medians, medians_ci = [], []
        averages, averages_ci = [], []
        for category in men:
            # NaN-aware mean: the interval midpoint is stored as the value
            # and the interval half-width as its error.
            # ``ci`` is presumably seaborn.utils.ci with its default
            # interval; confirm against the imports.
            boots = bootstrap(category[attribute],
                              func=np.nanmean,
                              n_boot=1000)
            ci_tmp = ci(boots)
            average = (ci_tmp[0] + ci_tmp[1]) / 2
            ci_average = (ci_tmp[1] - ci_tmp[0]) / 2
            averages.append(average)
            averages_ci.append(ci_average)
            # Same procedure for the NaN-aware median.
            boots = bootstrap(category[attribute],
                              func=np.nanmedian,
                              n_boot=1000)
            ci_tmp = ci(boots)
            median = (ci_tmp[0] + ci_tmp[1]) / 2
            ci_median = (ci_tmp[1] - ci_tmp[0]) / 2
            medians.append(median)
            medians_ci.append(ci_median)