def test_MSE_w_kfold_Ridge():
    '''
    Function testing the k-fold code on Ridge by comparing to scikit-learn.
    '''
    MSE_kf = reg.k_fold_cross_validation(X, z_flat, 5, reg.Ridge_fit,
                                         reg.Ridge_predict, 0.01)[0]
    cv = KFold(n_splits=5, shuffle=True)
    comparison = cross_validate(skl.Ridge(alpha=0.01, fit_intercept=False),
                                X, z_flat, cv=cv,
                                scoring=make_scorer(mean_squared_error))
    MSE_kf_skl = np.mean(comparison["test_score"])
    tol = 1E-2
    success = abs(MSE_kf - MSE_kf_skl) < tol
    msg = 'Error: The MSE from own k-fold code for Ridge did not match the MSE from scikit-learn.'
    assert success, msg
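# For reference, a minimal sketch of the closed-form Ridge solver the test above
# assumes: beta = (X^T X + lambda*I)^(-1) X^T z, with no separate intercept
# (matching fit_intercept=False in the scikit-learn comparison). These helpers
# are hypothetical; the project's actual reg.Ridge_fit/reg.Ridge_predict may differ.
def _ridge_fit_sketch(X, z, _lambda):
    # solve the penalized normal equations directly
    p = X.shape[1]
    return np.linalg.solve(X.T @ X + _lambda * np.eye(p), X.T @ z)

def _ridge_predict_sketch(beta, X):
    # plain linear prediction
    return X @ beta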
def test_MSE_w_kfold_OLS():
    '''
    Function testing the k-fold code on OLS by comparing to scikit-learn.
    '''
    MSE_kf = reg.k_fold_cross_validation(X, z_flat, 5, reg.OLS_fit,
                                         reg.OLS_predict)[0]
    cv = KFold(n_splits=5, shuffle=True)
    comparison = cross_validate(skl.LinearRegression(), X, z_flat, cv=cv,
                                scoring=make_scorer(mean_squared_error))
    MSE_kf_skl = np.mean(comparison["test_score"])
    tol = 1E-3
    success = abs(MSE_kf - MSE_kf_skl) < tol
    msg = 'Error: The MSE from own k-fold code for OLS did not match the MSE from scikit-learn.'
    assert success, msg
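# A minimal sketch of the k-fold routine both tests assume: shuffle the rows,
# split into k folds, fit on k-1 folds, score on the held-out fold, and return
# the mean test MSE as the first element (which is why the tests index [0]).
# The name, extra arguments, and return layout are assumptions read off the
# calls above; reg.k_fold_cross_validation itself may be implemented differently.
def _k_fold_sketch(X, z, k, fit_type, predict_type, *args, **kwargs):
    idx = np.random.permutation(len(z))
    folds = np.array_split(idx, k)
    mse = []
    for fold in folds:
        train = np.setdiff1d(idx, fold)
        beta = fit_type(X[train], z[train], *args, **kwargs)
        mse.append(mean_squared_error(z[fold], predict_type(beta, X[fold])))
    return (np.mean(mse),)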
def OLS_analysis():
    print('Analysis for OLS')

    # matrices for table construction
    MSEs = np.zeros((maxdegree, 4))
    R2s = np.zeros((maxdegree, 3))

    # first column of tables: polynomial degrees
    degrees = [i for i in range(1, maxdegree+1)]
    MSEs[:,0] = degrees
    R2s[:,0] = degrees

    # for bias-variance tradeoff
    error_test = np.zeros(maxdegree)
    error_train = np.zeros(maxdegree)
    bias = np.zeros(maxdegree)
    variance = np.zeros(maxdegree)

    # find error as a function of model complexity
    for degree in degrees:
        X = reg.CreateDesignMatrix_X(x, y, n=degree)

        # fitting without resampling
        beta = reg.OLS_fit(X, z_flat)
        z_tilde = reg.OLS_predict(beta, X)

        # MSE
        MSE = mean_squared_error(z_flat, z_tilde)
        MSEs[degree-1][1] = MSE

        # R²
        R_squared = r2_score(z_flat, z_tilde)
        R2s[degree-1][1] = R_squared

        # confidence interval of betas
        uppers, lowers = reg.CI_beta(X, beta)

        # fitting with resampling of data
        X_train, X_test, z_train, z_test = train_test_split(X, z_flat, test_size=0.33)
        beta_train = reg.OLS_fit(X_train, z_train)
        z_tilde = reg.OLS_predict(beta_train, X_test)

        # MSE
        MSE = mean_squared_error(z_test, z_tilde)
        MSEs[degree-1][2] = MSE

        # R²
        R_squared = r2_score(z_test, z_tilde)
        R2s[degree-1][2] = R_squared

        # fitting with k-fold cross-validation
        MSE_kf = reg.k_fold_cross_validation(X, z_flat, 5, reg.OLS_fit,
                                             reg.OLS_predict)[0]
        MSEs[degree-1][3] = MSE_kf

        # train vs test and bias/variance calculations using bootstrap
        e, e2, b, v = reg.bootstrap(X, z_flat, 100, fit_type=reg.OLS_fit,
                                    predict_type=reg.OLS_predict)
        error_test[degree-1] = e
        error_train[degree-1] = e2
        bias[degree-1] = b
        variance[degree-1] = v

        # train vs test using CV (alternative to the bootstrap errors above)
        #e, e2 = reg.k_fold_cross_validation(X, z_flat, k=5, fit_type=reg.OLS_fit,
        #                                    predict_type=reg.OLS_predict, tradeoff=False)
        #error_test[degree-1] = e
        #error_train[degree-1] = e2

        # plot confidence interval for betas only for the last degree
        if degree == maxdegree:
            sns.set()
            plt.fill_between(np.arange(len(beta)), uppers, lowers,
                             label='95% confidence interval', color=(.5, .5, .5, .2))
            plt.plot(beta, label='beta', color="r")
            plt.title(f'95 percent Confidence Interval for Beta from OLS when Degree={maxdegree}')
            plt.ylabel('beta_i')
            plt.xlabel('i')
            plt.legend()
            plt.show()

    # making LaTeX table for MSEs
    df = pd.DataFrame(MSEs, columns=['Degree', 'No resampling', 'train_test_split', 'k-fold CV'])
    df['Degree'] = df['Degree'].astype(int)
    tab = df.to_latex(index=False, float_format="%.5f")
    print(f"\n\n{tab}\n\n")

    # making LaTeX table for R2s
    df = pd.DataFrame(R2s, columns=['Degree', 'No resampling', 'train_test_split'])
    df['Degree'] = df['Degree'].astype(int)
    tab = df.to_latex(index=False, float_format="%.5f")
    print(f"\n\n{tab}\n\n")

    # plot test vs train MSE (from bootstrap; the CV block above is commented out)
    sns.set()
    plt.plot(degrees, error_test, label='Test MSE')
    plt.plot(degrees, error_train, label='Train MSE')
    plt.title('Train vs Test MSE for OLS using the Bootstrap Method')
    plt.xlabel('Polynomial degree')
    plt.ylabel('Error')
    plt.legend()
    plt.show()

    # plot bias-variance tradeoff using bootstrap
    sns.set()
    plt.plot(degrees, error_test, label='MSE')
    plt.plot(degrees, bias, label='bias')
    plt.plot(degrees, variance, label='Variance')
    plt.title('Bias-Variance Tradeoff for OLS using the Bootstrap Method')
    plt.xlabel('Polynomial degree')
    plt.ylabel('Error')
    plt.legend()
    plt.show()
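# A minimal sketch of the bootstrap routine OLS_analysis assumes: resample the
# training rows with replacement n_bootstraps times, collect predictions on a
# fixed test split, and decompose the test error into (squared) bias and
# variance. The signature mirrors the call above; the project's reg.bootstrap
# may be implemented differently.
def _bootstrap_sketch(X, z, n_bootstraps, fit_type, predict_type, **kwargs):
    X_train, X_test, z_train, z_test = train_test_split(X, z, test_size=0.33)
    z_pred = np.empty((len(z_test), n_bootstraps))
    z_fit = np.empty((len(z_train), n_bootstraps))
    for i in range(n_bootstraps):
        idx = np.random.randint(0, len(z_train), len(z_train))  # resample with replacement
        beta = fit_type(X_train[idx], z_train[idx], **kwargs)
        z_pred[:, i] = predict_type(beta, X_test)
        z_fit[:, i] = predict_type(beta, X_train)
    error_test = np.mean((z_test[:, None] - z_pred)**2)    # expected test MSE
    error_train = np.mean((z_train[:, None] - z_fit)**2)   # expected train MSE
    bias = np.mean((z_test - np.mean(z_pred, axis=1))**2)  # squared bias (plus noise)
    variance = np.mean(np.var(z_pred, axis=1))             # variance of predictions
    return error_test, error_train, bias, variance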
def Lasso_analysis():
    print('Analysis for Lasso')

    lambdas = [10**i for i in range(-5, 2)]       # lambda values to test
    degrees = [i for i in range(1, maxdegree+1)]  # complexities to test

    MSEs = np.zeros((maxdegree, 4))               # collection of MSEs for a given lambda
    MSEs[:,0] = degrees                           # first column in table is complexity
    R2s = np.zeros((maxdegree, 3))                # collection of R²s for a given lambda
    R2s[:,0] = degrees                            # first column in table is complexity
    MSEs_kfold = np.zeros((len(lambdas), maxdegree))  # for plotting heatmap

    # for bias-variance tradeoff (bootstrap) and train/test curves (CV)
    error_test = np.zeros(maxdegree)
    error_train = np.zeros(maxdegree)
    error_test_cv = np.zeros(maxdegree)
    error_train_cv = np.zeros(maxdegree)
    bias = np.zeros(maxdegree)
    variance = np.zeros(maxdegree)

    # find error as a function of complexity and lambda
    for degree in range(1, maxdegree+1):
        X = reg.CreateDesignMatrix_X(x, y, n=degree)

        for lam_index, lam in enumerate(lambdas):
            MSE_kfold = reg.k_fold_cross_validation(X, z_flat, 5, reg.Lasso_fit,
                                                    reg.Lasso_predict, _lambda=lam)[0]
            MSEs_kfold[lam_index][degree-1] = MSE_kfold

        # fitting without resampling
        model = reg.Lasso_fit(X, z_flat, _lambda=1)
        z_tilde = reg.Lasso_predict(model, X)

        # MSE
        MSE = mean_squared_error(z_flat, z_tilde)
        MSEs[degree-1][1] = MSE

        # R²
        R_squared = r2_score(z_flat, z_tilde)
        R2s[degree-1][1] = R_squared

        # fitting with resampling
        X_train, X_test, z_train, z_test = train_test_split(X, z_flat, test_size=0.33)
        model_train = reg.Lasso_fit(X_train, z_train, _lambda=1)
        z_tilde = reg.Lasso_predict(model_train, X_test)

        # MSE
        MSE = mean_squared_error(z_test, z_tilde)
        MSEs[degree-1][2] = MSE

        # R²
        R_squared = r2_score(z_test, z_tilde)
        R2s[degree-1][2] = R_squared

        # k-fold into table; index 5 is lambda = 1, matching the fits above
        MSEs[degree-1][3] = MSEs_kfold[5][degree-1]

        # train vs test and bias/variance calculations using bootstrap for chosen lambda
        e, e2, b, v = reg.bootstrap(X, z_flat, 100, fit_type=reg.Lasso_fit,
                                    predict_type=reg.Lasso_predict, _lambda=1)
        error_test[degree-1] = e
        error_train[degree-1] = e2
        bias[degree-1] = b
        variance[degree-1] = v

        # train vs test using CV (kept separate so it does not overwrite the bootstrap errors)
        e, e2 = reg.k_fold_cross_validation(X, z_flat, k=5, fit_type=reg.Lasso_fit,
                                            predict_type=reg.Lasso_predict,
                                            tradeoff=False, _lambda=1)
        error_test_cv[degree-1] = e
        error_train_cv[degree-1] = e2

    # making LaTeX table for MSEs (the one corresponding to the best lambda) to compare with OLS
    df = pd.DataFrame(MSEs, columns=['Degree', 'No resampling', 'train_test_split', 'k-fold CV'])
    df['Degree'] = df['Degree'].astype(int)
    tab = df.to_latex(index=False, float_format="%.5f")
    print(f"\n\n{tab}\n\n")

    # making LaTeX table for R2s (the one corresponding to the best lambda) to compare with OLS
    df = pd.DataFrame(R2s, columns=['Degree', 'No resampling', 'train_test_split'])
    df['Degree'] = df['Degree'].astype(int)
    tab = df.to_latex(index=False, float_format="%.5f")
    print(f"\n\n{tab}\n\n")

    # plotting heatmap with k-fold CV to choose the best combination of degree and lambda
    fig, ax = plt.subplots()
    sns.heatmap(MSEs_kfold, xticklabels=degrees, yticklabels=np.log10(lambdas),
                annot=True, fmt='.2f')
    plt.xlabel('Polynomial degree')
    plt.ylabel('log10(lambda)')
    plt.title('MSE as a function of lambda and complexity with Lasso regression')
    plt.tight_layout()
    plt.show()

    # plot test vs train MSE using CV for the chosen lambda
    sns.set()
    plt.plot(degrees, error_test_cv, label='Test MSE')
    plt.plot(degrees, error_train_cv, label='Train MSE')
    plt.title('Train vs Test MSE for Lasso using the 5-fold CV Method')
    plt.xlabel('Polynomial degree')
    plt.ylabel('Error')
    plt.legend()
    plt.show()

    # plot bias-variance tradeoff using bootstrap for the chosen lambda
    sns.set()
    plt.plot(degrees, error_test, label='MSE')
    plt.plot(degrees, bias, label='bias')
    plt.plot(degrees, variance, label='Variance')
    plt.title('Bias-Variance Tradeoff for Lasso using the Bootstrap Method')
    plt.xlabel('Polynomial degree')
    plt.ylabel('Error')
    plt.legend()
    plt.show()

    # find minimum MSE from k-fold
    print(MSEs_kfold.min())
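# A hypothetical driver sketching how the module-level data the analyses rely
# on (x, y, z_flat, maxdegree, X) might be prepared. The real project defines
# these elsewhere; the noisy stand-in surface below is only an illustration.
if __name__ == '__main__':
    np.random.seed(42)
    maxdegree = 5
    x = np.random.rand(20, 20)
    y = np.random.rand(20, 20)
    z = np.sin(np.pi * x) * np.cos(np.pi * y) + 0.1 * np.random.randn(20, 20)  # stand-in surface
    z_flat = z.ravel()
    X = reg.CreateDesignMatrix_X(x, y, n=maxdegree)  # used by the k-fold tests
    OLS_analysis()
    Lasso_analysis()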
from __future__ import print_function  # makes the prints below work under Python 2 (GraphLab) as well

import numpy as np
import graphlab as gl  # assumed import: the snippet below uses gl.SFrame
from regression import slice_data
from regression import k_fold_cross_validation
from regression import polynomial_sframe

print("*** Selecting an L2 penalty via cross-validation")

sales = gl.SFrame('kc_house_data.gl/')
sales = sales.sort(['sqft_living', 'price'])

# split the data set into training, validation and testing
(train_valid, test) = sales.random_split(.9, seed=1)
train_valid_shuffled = gl.toolkits.cross_validation.shuffle(train_valid, random_seed=1)

n = len(train_valid_shuffled)
k = 10  # 10-fold cross-validation

# fold index is 0-based, so 3 selects the fourth fold
(start, end) = slice_data(n, k, 3)
validation4 = train_valid_shuffled[start:end]
print("Test data slice. Answer should be 536234:", int(round(validation4['price'].mean(), 0)))

feature = polynomial_sframe(train_valid_shuffled['sqft_living'], 15)
for l2 in np.logspace(1, 7, num=13):
    rss = k_fold_cross_validation(10, l2, train_valid_shuffled, 'price', ['sqft_living'])
    print("For L2_penalty =", l2, ", validation error is", rss)
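# A minimal sketch of the slice_data helper the script assumes: it returns the
# (start, end) row indices of fold i out of k over n rows, with end exclusive
# so that data[start:end] is exactly the i-th fold. Hypothetical; the real
# helper in regression.py may compute the boundaries differently.
def _slice_data_sketch(n, k, i):
    start = (n * i) // k
    end = (n * (i + 1)) // k
    return (start, end)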