Beispiel #1
0
def test_MSE_wo_resamp_OLS():
    '''Check the own OLS implementation against scikit-learn, without resampling.

    Both models are fitted to, and evaluated on, the module-level ``X`` and
    ``z_flat``; the test passes when the two mean squared errors differ by
    less than an absolute tolerance of 1e-14.
    '''
    # Own implementation: fit, then predict on the same design matrix.
    own_prediction = reg.OLS_predict(reg.OLS_fit(X, z_flat), X)
    own_mse = mean_squared_error(z_flat, own_prediction)

    # scikit-learn reference, given exactly the same inputs.
    reference = skl.LinearRegression().fit(X, z_flat)
    reference_mse = mean_squared_error(z_flat, reference.predict(X))

    # Absolute difference allowed between the two MSE values.
    tolerance = 1E-14
    assert abs(own_mse - reference_mse) < tolerance, (
        'Error: The MSE for own OLS code did not match the MSE from scikit-learn.'
    )
def plot_best_fit(data, title, degree=5):
    '''Fit ``data`` with OLS and plot the fitted values as a surface.

    The design matrix is built from the module-level ``x`` and ``y``, and the
    prediction is reshaped to the shape of the module-level ``z`` before it is
    handed to ``plot_surface``.

    Parameters
    ----------
    data
        Target values passed to ``reg.OLS_fit`` (presumably the flattened
        surface, with as many entries as ``z`` -- confirm with callers).
    title
        Title forwarded to ``plot_surface``.
    degree : int, optional
        Value passed as ``n`` to ``reg.CreateDesignMatrix_X``. Defaults to 5,
        the value that used to be hard-coded.
    '''
    X = reg.CreateDesignMatrix_X(x, y, n=degree)
    beta = reg.OLS_fit(X, data)
    # The prediction must be reshaped to the shape of z for plotting.
    z_tilde = reg.OLS_predict(beta, X).reshape(z.shape)
    plot_surface(z_tilde, title)
def _print_latex_table(values, columns):
    '''Print ``values`` as a LaTeX table whose 'Degree' column is cast to int.'''
    df = pd.DataFrame(values, columns=columns)
    df['Degree'] = df['Degree'].astype(int)
    tab = df.to_latex(index=False, float_format="%.5f")
    print(f"\n\n{tab}\n\n")


def _plot_beta_confidence_interval(beta, uppers, lowers, degree):
    '''Plot ``beta`` together with the shaded band between ``uppers`` and ``lowers``.'''
    sns.set()
    plt.fill_between(np.arange(len(beta)), uppers, lowers,
                     label='95% confidence interval', color=(.5, .5, .5, .2))
    plt.plot(beta, label='beta', color="r")
    # The degree is taken from the caller rather than hard-coded in the title.
    plt.title(f'95 percent Confidence Interval for Beta from OLS when Degree={degree}')
    plt.ylabel('beta_i')
    plt.xlabel('i')
    plt.legend()
    plt.show()


def _plot_error_curves(degrees, curves, title):
    '''Plot each ``(label, values)`` pair in ``curves`` against ``degrees`` and show it.'''
    sns.set()
    for label, values in curves:
        plt.plot(degrees, values, label=label)
    plt.title(title)
    plt.xlabel('Polynomial degree')
    plt.ylabel('Error')
    plt.legend()
    plt.show()


def OLS_analysis():
    '''Run the OLS study for polynomial degrees 1..``maxdegree`` and report it.

    For every degree a design matrix is built from the module-level ``x`` and
    ``y`` and fitted to ``z_flat`` in four ways:

    * on all data (MSE and R2),
    * on a single train/test split with 33 % of the data held out (MSE and R2),
    * with ``reg.k_fold_cross_validation`` using 5 folds (MSE),
    * with ``reg.bootstrap`` called with 100 (presumably the number of
      resamples), which supplies test error, train error, bias and variance.

    Side effects: prints a header line and two LaTeX tables (MSE and R2), and
    shows three figures: the confidence interval of the coefficients for the
    highest degree, train vs. test MSE, and the bias-variance tradeoff.

    Takes no arguments and returns ``None``; everything is read from
    module-level names (``x``, ``y``, ``z_flat``, ``maxdegree``, ``reg``).
    '''
    print('Analysis for OLS')

    degrees = list(range(1, maxdegree + 1))

    # Table matrices; the first column of each holds the polynomial degree.
    MSEs = np.zeros((maxdegree, 4))
    R2s = np.zeros((maxdegree, 3))
    MSEs[:, 0] = degrees
    R2s[:, 0] = degrees

    # Per-degree quantities for the train/test and bias-variance plots.
    error_test = np.zeros(maxdegree)
    error_train = np.zeros(maxdegree)
    bias = np.zeros(maxdegree)
    variance = np.zeros(maxdegree)

    # Find the error as a function of model complexity.
    for row, degree in enumerate(degrees):
        X = reg.CreateDesignMatrix_X(x, y, n=degree)

        # Fit without resampling: train and evaluate on all data.
        beta = reg.OLS_fit(X, z_flat)
        z_fit = reg.OLS_predict(beta, X)
        MSEs[row, 1] = mean_squared_error(z_flat, z_fit)
        R2s[row, 1] = r2_score(z_flat, z_fit)

        # Confidence interval of the coefficients.
        # NOTE(review): only the last degree's interval is plotted; the call
        # stays here for every degree because reg.CI_beta is not visible from
        # this function and may do more than return the bounds.
        uppers, lowers = reg.CI_beta(X, beta)

        # Fit on a single random train/test split.
        X_train, X_test, z_train, z_test = train_test_split(
            X, z_flat, test_size=0.33)
        beta_train = reg.OLS_fit(X_train, z_train)
        z_pred = reg.OLS_predict(beta_train, X_test)
        MSEs[row, 2] = mean_squared_error(z_test, z_pred)
        R2s[row, 2] = r2_score(z_test, z_pred)

        # Fit with k-fold cross-validation; only the first returned value is used.
        MSEs[row, 3] = reg.k_fold_cross_validation(
            X, z_flat, 5, reg.OLS_fit, reg.OLS_predict)[0]

        # Train/test error and bias/variance from the bootstrap.
        # To use k-fold CV for train vs. test instead, fill error_test and
        # error_train from reg.k_fold_cross_validation(..., tradeoff=False)
        # and update the title of the train-vs-test plot below accordingly.
        (error_test[row], error_train[row],
         bias[row], variance[row]) = reg.bootstrap(
            X, z_flat, 100,
            fit_type=reg.OLS_fit, predict_type=reg.OLS_predict)

        # Plot the confidence interval of the coefficients for the last degree only.
        if degree == maxdegree:
            _plot_beta_confidence_interval(beta, uppers, lowers, degree)

    # LaTeX tables for the MSE and R2 values.
    _print_latex_table(
        MSEs, ['Degree', 'No resampling', 'train_test_split', 'k-fold CV'])
    _print_latex_table(
        R2s, ['Degree', 'No resampling', 'train_test_split'])

    # Train vs. test MSE. The values come from reg.bootstrap above, so the
    # title names the bootstrap (it previously claimed 5-fold CV).
    _plot_error_curves(
        degrees,
        [('Test MSE', error_test), ('Train MSE', error_train)],
        'Train vs Test MSE for OLS using the Bootstrap Method')

    # Bias-variance tradeoff, also from the bootstrap.
    _plot_error_curves(
        degrees,
        [('MSE', error_test), ('bias', bias), ('Variance', variance)],
        'Bias-Variance Tradeoff for OLS using the Bootstrap Method')
Beispiel #4
0
def plot_best_fit():
    '''Plot the surface predicted by an OLS fit with ``n=5`` in the design matrix.

    The design matrix is built from the module-level ``x`` and ``y`` and fitted
    to ``z_flat``; the in-sample prediction is reshaped to the shape of
    ``terrain_downsized`` and passed to ``plot_surface``.
    '''
    design = reg.CreateDesignMatrix_X(x, y, n=5)
    coefficients = reg.OLS_fit(design, z_flat)
    # The prediction must be reshaped to the shape of terrain_downsized for plotting.
    surface = reg.OLS_predict(coefficients, design).reshape(terrain_downsized.shape)
    plot_surface(surface, 'Surface plot of OLS fifth order polynomial')