def _fit_gaussian_kde(values, bandwidth):
    """Fit a one-dimensional Gaussian kernel density estimate.

    ``values`` is a 1-D numpy array of observations.  scikit-learn expects a
    2-D ``(n_samples, n_features)`` input, so a trailing axis is added.
    Returns the fitted ``KernelDensity`` estimator.
    """
    return KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(
        values[:, np.newaxis])


def figure_6_14():
    """Reproduces figure 6.14 in ESLii displaying a density estimate for
    sbp levels in chd/no-chd groups using a Gaussian kernel density estimate.

    The left panel plots the two class-conditional densities evaluated at the
    (sorted) observed sbp values of each group.  The right panel plots the
    estimated posterior probability of CHD over an evenly spaced grid of 100
    sbp values spanning the observed range.  The figure is displayed with
    ``plt.show()``; nothing is returned.
    """
    bandwidth = 7.5

    sa = eslii.read_sa_heart_data()
    sbp = sa["sbp"]

    # Work on plain, sorted numpy arrays.  The in-place ``Series.sort()`` and
    # ``Series.reshape()`` methods no longer exist in current pandas, whereas
    # np.asarray / np.sort behave the same on every pandas version.
    sbp_chd = np.sort(np.asarray(sa[sa["chd"] == 1]["sbp"]))
    sbp_no_chd = np.sort(np.asarray(sa[sa["chd"] == 0]["sbp"]))

    kde_chd = _fit_gaussian_kde(sbp_chd, bandwidth)
    kde_no_chd = _fit_gaussian_kde(sbp_no_chd, bandwidth)

    # Left panel: class-conditional densities.  score_samples returns log
    # densities, hence the exponentiation.
    plt.subplot(121)
    plt.plot(sbp_chd,
             np.exp(kde_chd.score_samples(sbp_chd[:, np.newaxis])),
             label="CHD")
    plt.plot(sbp_no_chd,
             np.exp(kde_no_chd.score_samples(sbp_no_chd[:, np.newaxis])),
             label="no CHD")
    plt.legend(loc='best')

    # Right panel: posterior P(CHD | sbp) via Bayes' rule, using the sample
    # class proportion as the prior.
    grid = np.linspace(sbp.min(), sbp.max(), 100)
    chd_dens = np.exp(kde_chd.score_samples(grid[:, np.newaxis]))
    no_chd_dens = np.exp(kde_no_chd.score_samples(grid[:, np.newaxis]))
    p_chd = float(len(sbp_chd)) / (len(sbp_chd) + len(sbp_no_chd))
    weighted_chd = p_chd * chd_dens
    posterior_est = weighted_chd / (weighted_chd + (1 - p_chd) * no_chd_dens)

    plt.subplot(122)
    plt.plot(grid, posterior_est)
    plt.show()
def figure_4_13():
    """Reproduces figure 4.13 in ESLii showing the coefficients of an
    L1-regularized logistic regression fit to the South African heart
    disease data as a function of the L1 length of beta.

    One model is fitted per value of ``C`` in a fixed list; for each fit the
    L1 norm of the coefficient vector is recorded together with every
    individual coefficient, and one line per predictor is drawn.  The figure
    is displayed with ``plt.show()``; nothing is returned.

    TODO: this doesn't match
    """
    data = eslii.read_sa_heart_data()
    data.drop([u"adiposity", u"typea"], axis=1, inplace=True)
    y = data["chd"]
    X = data.drop("chd", axis=1)
    # Encode the categorical family-history column as a 0/1 indicator.
    X["famhist"] = pandas.get_dummies(X["famhist"])["Present"]
    X = eslii.standardize_data(X, demeanCols=[])

    # Values passed as C (scikit-learn's inverse regularization strength),
    # ordered from strongest to weakest penalty.
    alphas = [1e-3, 1e-2, 2e-2, 3e-2, 4e-2, 5e-2, 6e-2, 7e-2, 8e-2, 9e-2,
              1e-1, .5, 1.0, 10.0]

    beta_norms = []
    coefs = {column: [] for column in X.columns}
    for alpha in alphas:
        # NOTE(review): recent scikit-learn releases default to a solver that
        # rejects penalty="l1"; solver="liblinear" may be required there.
        lr = LogisticRegression(penalty="l1", C=alpha).fit(X, y)
        fitted = lr.coef_[0]
        beta_norms.append(sum(abs(fitted)))
        for column, value in zip(X.columns, fitted):
            coefs[column].append(value)

    for column in X.columns:
        plt.plot(beta_norms, coefs[column])
    # Display the figure, as the other figure_* functions in this file do.
    plt.show()
def _print_logistic_fit(lr, columns):
    """Print the intercept and one coefficient per column of a fitted
    logistic regression, each formatted to three decimals.

    ``print`` is called with a single parenthesised argument so the code
    runs unchanged under both Python 2 and Python 3.
    """
    print("(Intercept) {:.3f}".format(lr.intercept_[0]))
    for column, coefficient in zip(columns, lr.coef_[0]):
        print("{} {:.3f}".format(column, coefficient))


def tables_4_2_and_4_3():
    """Reproduces table 4.2 and 4.3 in ESLii showing the results of a
    logistic regression fit to selected predictors of the South African
    heart disease data.

    The first table uses every predictor left after dropping "adiposity"
    and "typea"; the second additionally drops "sbp", "obesity" and
    "alcohol".  Both tables are printed to standard output, separated by
    blank lines; nothing is returned.
    """
    data = eslii.read_sa_heart_data()
    data.drop([u"adiposity", u"typea"], axis=1, inplace=True)
    y = data["chd"]
    X = data.drop("chd", axis=1)
    # Encode the categorical family-history column as a 0/1 indicator.
    X["famhist"] = pandas.get_dummies(X["famhist"])["Present"]

    # C is scikit-learn's inverse regularization strength, so a huge value
    # makes the penalty negligible (an effectively unpenalized fit).
    lr = LogisticRegression(C=1e30).fit(X, y)
    _print_logistic_fit(lr, X.columns)

    print("\n")

    X.drop(["sbp", "obesity", "alcohol"], axis=1, inplace=True)
    lr = LogisticRegression(C=1e30).fit(X, y)
    _print_logistic_fit(lr, X.columns)
def figure_5_4():
    """Reproduces figure 5.4 in ESLii displaying the fitted natural spline
    for each term.

    A 21-column design matrix is assembled from a four-column
    ``splines.ns_basis`` expansion of each of sbp, tobacco, ldl, obesity and
    age (knots at the 0/25/50/75/100th percentiles, no intercept column) plus
    the famhist indicator.  A logistic regression is fitted to it and the
    contribution of each term is scattered against the raw predictor in a
    3x2 grid of panels.  The figure is displayed; nothing is returned.
    """
    heart = eslii.read_sa_heart_data()
    heart.drop(["adiposity", "typea", "alcohol"], axis=1, inplace=True)
    y = heart["chd"]
    X = heart.drop("chd", axis=1)
    X["famhist"] = pandas.get_dummies(X["famhist"])["Present"]

    quantiles = [0, 25, 50, 75, 100]
    # (predictor, first design column, one-past-last design column), listed
    # in both design-matrix order and panel order.
    layout = [
        ("sbp", 0, 4),
        ("tobacco", 4, 8),
        ("ldl", 8, 12),
        ("famhist", 12, 13),
        ("obesity", 13, 17),
        ("age", 17, 21),
    ]

    # Allocated uninitialised; every one of the 21 columns is filled below.
    N = np.ndarray((X.shape[0], 21))
    for name, start, stop in layout:
        if name == "famhist":
            # The binary indicator enters the model as a single raw column.
            N[:, start] = X[name]
        else:
            N[:, start:stop] = splines.ns_basis(
                X[name],
                knots=np.percentile(X[name], quantiles),
                intercept=False)

    lr = LogisticRegression(C=1e50).fit(N, y)

    # Centre the columns only after fitting, so that each plotted term has
    # mean zero over the sample.
    N -= N.mean(axis=0)

    beta = lr.coef_[0]
    fig = plt.figure()
    for panel, (name, start, stop) in enumerate(layout):
        # 321, 322, ..., 326: successive cells of a 3x2 subplot grid.
        axes = fig.add_subplot(321 + panel)
        axes.scatter(X[name], np.dot(N[:, start:stop], beta[start:stop]))
    plt.show()