Example #1
0
File: ch5.py Project: syting/esl
def figure_5_3():
    """Plot the pointwise variance curves of figure 5.3 in ESLii.

    Draws 50 points with random.random(), sorts them, and evaluates four
    bases at those points: linear, cubic polynomial, cubic polynomial plus
    truncated cubic terms at .33 and .66, and the basis returned by
    splines.ns_basis for six knots. For each basis matrix H the curve
    h_i^T (H^T H)^-1 h_i is plotted against the points.
    """
    def pointwise_variance(basis):
        """Return h^T (H^T H)^-1 h for every row h of the basis matrix H."""
        gram_inverse = np.linalg.inv(np.dot(basis.transpose(), basis))
        variances = []
        for row in basis:
            variances.append(np.dot(row.transpose(),
                                    np.dot(gram_inverse, row)))
        return variances

    sample_count = 50
    points = np.sort([random.random() for _ in range(sample_count)])

    # Each basis extends the previous one with extra columns.
    linear = np.column_stack((np.ones(sample_count), points))
    cubic = np.column_stack((linear,
                             [p**2 for p in points],
                             [p**3 for p in points]))
    cubic_spline = np.column_stack((cubic,
                                    [max(0, (p - .33)**3) for p in points],
                                    [max(0, (p - .66)**3) for p in points]))
    natural = splines.ns_basis(points,
                               knots=[0.1, 0.26, 0.42, 0.58, 0.74, 0.9],
                               intercept=True)

    # Same plotting order as the bases were introduced above.
    for basis in (linear, cubic, cubic_spline, natural):
        plt.plot(points, pointwise_variance(basis))
Example #2
0
File: ch5.py Project: syting/esl
def figure_5_4():
    """Plot the fitted spline term of every predictor, as in figure 5.4 in
    ESLii.

    The design matrix has 21 columns: four splines.ns_basis columns (knots
    at the 0/25/50/75/100th percentiles, no intercept) for each of sbp,
    tobacco, ldl, obesity and age, plus one indicator column for famhist.
    A logistic regression is fit on it, the columns are then centered, and
    each predictor's contribution (its columns times its coefficients) is
    scattered against the predictor in a 3x2 grid of subplots.
    """
    heart = eslii.read_sa_heart_data()
    heart.drop(["adiposity", "typea", "alcohol"], axis=1, inplace=True)
    response = heart["chd"]
    predictors = heart.drop("chd", axis=1)
    predictors["famhist"] = pandas.get_dummies(predictors["famhist"])["Present"]

    percentiles = [0, 25, 50, 75, 100]
    # (predictor, first design column, one past its last design column),
    # listed in subplot order.
    layout = [("sbp", 0, 4),
              ("tobacco", 4, 8),
              ("ldl", 8, 12),
              ("famhist", 12, 13),
              ("obesity", 13, 17),
              ("age", 17, 21)]

    design = np.ndarray((predictors.shape[0], 21))
    for name, start, stop in layout:
        values = predictors[name]
        if name == "famhist":
            # Indicator variable: used as-is, no spline expansion.
            design[:, start] = values
        else:
            design[:, start:stop] = splines.ns_basis(
                values,
                knots=np.percentile(values, percentiles),
                intercept=False)

    lr = LogisticRegression(C=1e50).fit(design, response)
    # Center after fitting so that every plotted term averages to zero.
    design -= design.mean(axis=0)

    fig = plt.figure()
    coefficients = lr.coef_[0]
    for position, (name, start, stop) in enumerate(layout, 1):
        term = np.dot(design[:, start:stop], coefficients[start:stop])
        fig.add_subplot(3, 2, position).scatter(predictors[name], term)
    plt.show()
Example #3
0
File: ch5.py Project: syting/esl
def figure_5_5():
    """Reproduces figure 5.5 in ESLii: logistic regression on the "aa" versus
    "ao" phoneme data, fit once on the raw inputs and once on the inputs
    projected onto a spline basis.

    Top panel: the first 15 training rows of each class ("aa" in green,
    "ao" in red). Bottom panel: the raw coefficient vector together with
    the spline-restricted one, np.dot(N, theta). The train and test error
    rates of both classifiers are printed.

    Runs under both Python 2 and Python 3. The figure is built but
    plt.show() is not called here.
    """
    def split_by_speaker(frame):
        """Return the (train, test) rows of frame, each with a fresh index.

        A row belongs to a split when its 'speaker' string starts with the
        split's name. reset_index() keeps the old index as column 0, which
        the positional 1:257 slices below rely on.
        """
        def rows_for(prefix):
            # Explicit bool array: valid on Python 3 (where map() is lazy)
            # and still a row mask when the frame is empty.
            mask = np.array([s.startswith(prefix) for s in frame['speaker']],
                            dtype=bool)
            return frame[mask].reset_index()
        return rows_for("train"), rows_for("test")

    phoneme = eslii.read_phoneme_data()
    aa_train, aa_test = split_by_speaker(phoneme[phoneme['g'] == 'aa'])
    ao_train, ao_test = split_by_speaker(phoneme[phoneme['g'] == 'ao'])

    # A real list, so the project spline helper sees the same argument type
    # on Python 3 as it did on Python 2.
    frequencies = list(range(1, 257))

    # Plot some examples of the data
    fit = plt.figure()
    # Create the axes once; repeated add_subplot(211) calls are not
    # guaranteed to return the same axes on newer matplotlib.
    examples = fit.add_subplot(211)
    for i in range(15):
        # NOTE(review): assumes positions 1..256 hold the 256 input columns
        # after reset_index() - depends on eslii.read_phoneme_data; confirm.
        examples.plot(frequencies, aa_train.iloc[i, 1:257], c='green')
        examples.plot(frequencies, ao_train.iloc[i, 1:257], c='red')

    # Separate out train and test data/labels (aa -> 0, ao -> 1)
    train_X = np.concatenate((aa_train.iloc[:, 1:257],
                              ao_train.iloc[:, 1:257]))
    train_y = [0] * aa_train.shape[0] + [1] * ao_train.shape[0]
    test_X = np.concatenate((aa_test.iloc[:, 1:257],
                             ao_test.iloc[:, 1:257]))
    test_y = [0] * aa_test.shape[0] + [1] * ao_test.shape[0]

    # Train raw classifier
    lr = LogisticRegression(C=1e50).fit(train_X, train_y)
    print("Raw errors: {:.2f} {:.2f}".format(1 - lr.score(train_X, train_y),
                                             1 - lr.score(test_X, test_y)))

    # Train regularized classifier: the inputs are projected onto the spline
    # basis N, so the implied coefficient curve is np.dot(N, theta).
    N = splines.ns_basis(frequencies, [1, 21, 42, 64, 85, 106, 128, 149, 170,
                                       192, 213, 234, 256])
    train_proj = np.dot(train_X, N)
    test_proj = np.dot(test_X, N)
    lr2 = LogisticRegression(C=1e50).fit(train_proj, train_y)
    print("Reg errors: {:.2f} {:.2f}".format(1 - lr2.score(train_proj, train_y),
                                             1 - lr2.score(test_proj, test_y)))

    fit.add_subplot(212).plot(frequencies, lr.coef_[0],
                              frequencies, np.dot(N, lr2.coef_[0]))