def test_MSE_wo_resamp_OLS():
    '''
    Check the in-house OLS implementation (no resampling) against scikit-learn.

    Both models are fitted and evaluated on the same module-level design
    matrix X and target vector z_flat; the test passes when the two mean
    squared errors agree to within an absolute tolerance of 1E-14.
    '''
    # MSE obtained with the project's own OLS fit/predict routines.
    own_coefficients = reg.OLS_fit(X, z_flat)
    own_prediction = reg.OLS_predict(own_coefficients, X)
    own_mse = mean_squared_error(z_flat, own_prediction)

    # Reference MSE from scikit-learn's LinearRegression on identical data.
    reference = skl.LinearRegression()
    reference.fit(X, z_flat)
    reference_mse = mean_squared_error(z_flat, reference.predict(X))

    tolerance = 1E-14
    assert abs(own_mse - reference_mse) < tolerance, (
        'Error: The MSE for own OLS code did not match the MSE from scikit-learn.'
    )
def plot_best_fit(data, title):
    '''
    Fit OLS to `data` and draw the fitted values as a surface.

    The design matrix is built from the module-level x and y with n=5, and
    the prediction is reshaped to the shape of the module-level z before
    being handed to plot_surface.

    Parameters
    ----------
    data : target values passed to reg.OLS_fit (presumably flattened so that
        they line up with the rows of the design matrix -- confirm at caller).
    title : title forwarded unchanged to plot_surface.
    '''
    design_matrix = reg.CreateDesignMatrix_X(x, y, n=5)
    coefficients = reg.OLS_fit(design_matrix, data)
    fitted_surface = reg.OLS_predict(coefficients, design_matrix).reshape(z.shape)
    plot_surface(fitted_surface, title)
def _print_latex_table(values, columns):
    '''Print `values` as a LaTeX table; the 'Degree' column is cast to int.'''
    df = pd.DataFrame(values, columns=columns)
    df['Degree'] = df['Degree'].astype(int)
    tab = df.to_latex(index=False, float_format="%.5f")
    print(f"\n\n{tab}\n\n")


def OLS_analysis():
    '''
    Run the OLS analysis for polynomial degrees 1..maxdegree.

    For every degree the function builds a design matrix from the
    module-level x and y and records, against the module-level z_flat:

    * MSE and R2 when fitting and predicting on the full data set,
    * MSE and R2 on the test part of a single train_test_split
      (test_size = 0.33, unseeded, so the numbers vary between runs),
    * the first value returned by reg.k_fold_cross_validation (called with 5),
    * the four values returned by reg.bootstrap (called with 100), stored as
      test error, train error, bias and variance.

    Side effects: prints two LaTeX tables (MSE and R2) and shows three
    matplotlib figures: the beta confidence interval for the highest degree,
    train vs test MSE, and the bias-variance tradeoff. Returns None.
    '''
    print('Analysis for OLS')

    # matrices for table construction; first column holds the polynomial degree
    MSEs = np.zeros((maxdegree, 4))
    R2s = np.zeros((maxdegree, 3))
    degrees = list(range(1, maxdegree + 1))
    MSEs[:, 0] = degrees
    R2s[:, 0] = degrees

    # for bias-variance tradeoff
    error_test = np.zeros(maxdegree)
    error_train = np.zeros(maxdegree)
    bias = np.zeros(maxdegree)
    variance = np.zeros(maxdegree)

    # find error as function of model complexity
    for degree in degrees:
        row = degree - 1
        X = reg.CreateDesignMatrix_X(x, y, n=degree)

        # fitting without resampling
        beta = reg.OLS_fit(X, z_flat)
        z_tilde = reg.OLS_predict(beta, X)
        MSEs[row, 1] = mean_squared_error(z_flat, z_tilde)
        R2s[row, 1] = r2_score(z_flat, z_tilde)

        # fitting with resampling of data
        X_train, X_test, z_train, z_test = train_test_split(
            X, z_flat, test_size=0.33)
        beta_train = reg.OLS_fit(X_train, z_train)
        z_tilde = reg.OLS_predict(beta_train, X_test)
        MSEs[row, 2] = mean_squared_error(z_test, z_tilde)
        R2s[row, 2] = r2_score(z_test, z_tilde)

        # fitting with k-fold cross validation (only the first return value
        # is tabulated)
        MSEs[row, 3] = reg.k_fold_cross_validation(
            X, z_flat, 5, reg.OLS_fit, reg.OLS_predict)[0]

        # train vs test and bias/variance calculations using bootstrap
        # NOTE: the train/test curves could alternatively be taken from
        # reg.k_fold_cross_validation; if that is done, the title of the
        # train-vs-test plot below must be changed accordingly.
        error_test[row], error_train[row], bias[row], variance[row] = \
            reg.bootstrap(X, z_flat, 100, fit_type=reg.OLS_fit,
                          predict_type=reg.OLS_predict)

        # plot confidence interval for betas only for last degree; the
        # interval is computed here because it is not used for other degrees
        if degree == maxdegree:
            uppers, lowers = reg.CI_beta(X, beta)
            sns.set()
            plt.fill_between(np.arange(len(beta)), uppers, lowers,
                             label='95% confidence interval',
                             color=(.5, .5, .5, .2))
            plt.plot(beta, label='beta', color="r")
            # the title reports the degree actually plotted, not a fixed 5
            plt.title('95 percent Confidence Interval for Beta from OLS '
                      f'when Degree={maxdegree}')
            plt.ylabel('beta_i')
            plt.xlabel('i')
            plt.legend()
            plt.show()

    # making LaTex table for MSEs
    _print_latex_table(
        MSEs, ['Degree', 'No resampling', 'train_test_split', 'k-fold CV'])

    # making LaTex table for R2s
    _print_latex_table(R2s, ['Degree', 'No resampling', 'train_test_split'])

    # plot test vs train MSE; the curves come from the bootstrap above
    sns.set()
    plt.plot(degrees, error_test, label='Test MSE')
    plt.plot(degrees, error_train, label='Train MSE')
    plt.title('Train vs Test MSE for OLS using the Bootstrap Method')
    plt.xlabel('Polynomial degree')
    plt.ylabel('Error')
    plt.legend()
    plt.show()

    # plot bias-variance tradeoff using bootstrap
    sns.set()
    plt.plot(degrees, error_test, label='MSE')
    plt.plot(degrees, bias, label='bias')
    plt.plot(degrees, variance, label='Variance')
    plt.title('Bias-Variance Tradeoff for OLS using the Bootstrap Method')
    plt.xlabel('Polynomial degree')
    plt.ylabel('Error')
    plt.legend()
    plt.show()
def plot_best_fit():
    '''
    Fit OLS to the module-level z_flat and draw the fitted values as a surface.

    The design matrix is built from the module-level x and y with n=5; the
    prediction is reshaped to the shape of terrain_downsized and passed to
    plot_surface with a fixed title.
    '''
    design_matrix = reg.CreateDesignMatrix_X(x, y, n=5)
    coefficients = reg.OLS_fit(design_matrix, z_flat)
    prediction = reg.OLS_predict(coefficients, design_matrix)
    fitted_surface = prediction.reshape(terrain_downsized.shape)
    plot_surface(fitted_surface, 'Surface plot of OLS fifth order polynomial')