from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor

def process_optimized_knr(data, max_number_of_neighbors=50):
    """Loop over neighbor counts to find the best-performing KNN regressor."""
    min_mean_sqr_error = 0
    max_r2_score = 0
    opt_neighbor = 0
    for neighbor in range(1, max_number_of_neighbors):
        model = KNeighborsRegressor(n_neighbors=neighbor)
        model.fit(data["X_train"], data["y_train"])
        predicted_values = model.predict(data["X_test"])
        mean_sqr_error = mean_squared_error(data["y_test"], predicted_values)
        r2_score_calc = r2_score(data["y_test"], predicted_values)
        # Track the best R^2 seen so far. Compare the raw score: taking
        # abs() here would wrongly favor large negative R^2 values.
        if max_r2_score < r2_score_calc:
            min_mean_sqr_error = mean_sqr_error
            max_r2_score = r2_score_calc
            opt_neighbor = neighbor
    return {
        "name": "KNR",
        "data": {"neighbors": opt_neighbor},
        "mean_sqr_err": min_mean_sqr_error,
        "r2_score": max_r2_score,
    }
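# A minimal usage sketch for process_optimized_knr (assumed, not from the
# original code); make_regression stands in for whatever dataset the
# surrounding notebook actually uses.
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=200, n_features=4, noise=10.0, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
result = process_optimized_knr(
    {"X_train": X_train, "y_train": y_train, "X_test": X_test, "y_test": y_test}
)
print(result["data"]["neighbors"], result["r2_score"])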
plot_residual(y3_train, ridge3_y_train_pred, y3_test, ridge3_y_test_pred)
plot_residual(y6_train, ridge6_y_train_pred, y6_test, ridge6_y_test_pred)
plot_residual(y9_train, ridge9_y_train_pred, y9_test, ridge9_y_test_pred)

from sklearn.neighbors import KNeighborsRegressor

knn3 = KNeighborsRegressor()
knn6 = KNeighborsRegressor()
knn9 = KNeighborsRegressor()

knn3_scores = []
knn6_scores = []
knn9_scores = []
n_neighbors_space = np.arange(1, 11)

# Record the mean 10-fold cross-validation score for each neighbor count
for n in n_neighbors_space:
    knn3.n_neighbors = n
    knn3_cv_scores = cross_val_score(knn3, X, y3, cv=10)
    knn3_scores.append(np.mean(knn3_cv_scores))

    knn6.n_neighbors = n
    knn6_cv_scores = cross_val_score(knn6, X, y6, cv=10)
    knn6_scores.append(np.mean(knn6_cv_scores))

    knn9.n_neighbors = n
    knn9_cv_scores = cross_val_score(knn9, X, y9, cv=10)
    knn9_scores.append(np.mean(knn9_cv_scores))

# Refit knn3 with the neighbor count that scored best in cross-validation
knn3 = KNeighborsRegressor(n_neighbors=n_neighbors_space[np.argmax(knn3_scores)])
print("The best value for n_neighbors is:", n_neighbors_space[np.argmax(knn3_scores)])
knn3.fit(X_train_std, y3_train)
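# plot_residual is called above but not defined in this snippet. A minimal
# sketch of what such a helper might look like (signature inferred from the
# calls above; the styling is assumed):
import matplotlib.pyplot as plt

def plot_residual(y_train, y_train_pred, y_test, y_test_pred):
    """Scatter residuals (prediction - target) against predicted values."""
    plt.scatter(y_train_pred, y_train_pred - y_train,
                c="steelblue", marker="o", label="Training data")
    plt.scatter(y_test_pred, y_test_pred - y_test,
                c="limegreen", marker="s", label="Test data")
    plt.axhline(y=0, color="black", lw=2)
    plt.xlabel("Predicted values")
    plt.ylabel("Residuals")
    plt.legend(loc="upper left")
    plt.show()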
mse = mean_squared_error(test_target_0, test_prediction)
print(mse)

# + [markdown]
# ## Overfitting vs. underfitting
# * Overfitting: the model scores well on the training data but poorly on the test data
# * Underfitting: the test score is higher than the training score, or both scores are low

# +
print('Training set R^2 = ', knr.score(train_input, train_target_0))
print('Test set R^2 = ', knr.score(test_input, test_target_0))

# +
# Set the number of neighbors to 3
knr.n_neighbors = 3
# Retrain the model
knr.fit(train_input, train_target_0)
print('Training set R^2 = ', knr.score(train_input, train_target_0))
print('Test set R^2 = ', knr.score(test_input, test_target_0))

# +
# Record train/test R^2 for n_neighbors = 1..20
r2_train = np.zeros(20)
r2_test = np.zeros(20)
neighbors_n = np.zeros(20)
for n in range(1, 21):
    knr.n_neighbors = n
    knr.fit(train_input, train_target_0)
    r2_train[n - 1] = knr.score(train_input, train_target_0)
    r2_test[n - 1] = knr.score(test_input, test_target_0)
    neighbors_n[n - 1] = n
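# +
# A plotting sketch (assumed, not in the original cell): visualize how the
# train/test R^2 scores recorded above change with n_neighbors, which makes
# the overfitting/underfitting trade-off visible.
import matplotlib.pyplot as plt

plt.plot(neighbors_n, r2_train, label='train R^2')
plt.plot(neighbors_n, r2_test, label='test R^2')
plt.xlabel('n_neighbors')
plt.ylabel('R^2')
plt.legend()
plt.show()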
    5.9, 32.0, 40.0, 51.5, 70.0, 100.0, 78.0, 80.0, 85.0, 85.0, 110.0,
    115.0, 125.0, 130.0, 120.0, 120.0, 130.0, 135.0, 110.0, 130.0, 150.0,
    145.0, 150.0, 170.0, 225.0, 145.0, 188.0, 180.0, 197.0, 218.0, 300.0,
    260.0, 265.0, 250.0, 250.0, 300.0, 320.0, 514.0, 556.0, 840.0, 685.0,
    700.0, 700.0, 690.0, 900.0, 650.0, 820.0, 850.0, 900.0, 1015.0, 820.0,
    1100.0, 1000.0, 1100.0, 1000.0, 1000.0
])

# Split train set and test set
train_input, test_input, train_target, test_target = train_test_split(
    perch_length, perch_weight, random_state=42)

# Reshape to 2-D (n_samples, 1) arrays, as scikit-learn expects
train_input = train_input.reshape(-1, 1)
test_input = test_input.reshape(-1, 1)

# KNN regression
knr = KNeighborsRegressor()
knr.fit(train_input, train_target)
print(knr.score(test_input, test_target))
print(knr.score(train_input, train_target))

# The first output (test R^2) is bigger than the second (train R^2),
# so this model is underfitted. This can be solved by making the model
# more complex; a KNN model generally becomes more complex as the
# number of neighbors decreases.
knr.n_neighbors = 3  # Decrease the number of neighbors
knr.fit(train_input, train_target)
print(knr.score(test_input, test_target))
print(knr.score(train_input, train_target))
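# A quick illustration (not from the original code) of why reshape(-1, 1)
# is needed: scikit-learn estimators expect a 2-D feature matrix, so a 1-D
# array of lengths must become a single-column matrix first.
import numpy as np

lengths = np.array([8.4, 13.7, 15.0])
print(lengths.shape)                 # (3,)   -- 1-D, rejected by fit()
print(lengths.reshape(-1, 1).shape)  # (3, 1) -- one sample per row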
# The predictions differ from the targets by about 19 g on average
sc = knr.score(train_input, train_target)
print(sc)  # 0.9698823289099254

'''
Training set R^2 : 0.9698823289099254
Test set R^2     : 0.992809406101064
==> The test-set score (R^2) is higher than the training-set R^2
==> Underfitting

[1] Overfitting  : training-set score > test-set score
[2] Underfitting : training-set score < test-set score, or both scores are very low

How to fix underfitting?
==> Make the model more complex.
==> With the KNR algorithm, the model becomes more complex as K (the number
    of neighbors) decreases. Reduce K from the default of 5 to 3 and retrain.
'''
knr.n_neighbors = 3
knr.fit(train_input, train_target)
sc1 = knr.score(train_input, train_target)
sc2 = knr.score(test_input, test_target)
print('Training set R^2:', sc1)  # 0.9804899950518966
print('Test set R^2:', sc2)      # 0.9746459963987609
'''
The underfitting problem is resolved:
Training set R^2: 0.9804899950518966
Test set R^2: 0.9746459963987609
'''

'''
A problem with the KNR model:
# Let's predict the weight of a perch that is 50 cm long and weighs 1.5 kg
'''
prd = knr.predict([[50]])
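# A follow-up sketch (assumed, not in the original): KNN regression predicts
# the mean target of the nearest training samples, so it cannot extrapolate
# beyond the training range -- the 50 cm perch gets the same prediction as
# the largest fish in the training set. Inspecting its neighbors shows why.
distances, indexes = knr.kneighbors([[50]])
print(prd)                             # the model's prediction
print(train_target[indexes])           # weights of the 3 nearest neighbors
print(np.mean(train_target[indexes]))  # equals the prediction above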
# Performance Info
from sklearn import metrics

print(f"MAE (average absolute residual): {metrics.mean_absolute_error(y_test, predicted_values)}")
print(f"MSE: {metrics.mean_squared_error(y_test, predicted_values)}")
print(f"RMSE: {np.sqrt(metrics.mean_squared_error(y_test, predicted_values))}")
print('Variance score (the closer to 1.0, the better): %.2f' % metrics.r2_score(y_test, predicted_values))

# Using KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor

# Loop over neighbor counts to find the best-performing KNN model
max_number_of_neighbors = 50
min_mean_sqr_error = 0
max_r2_score = 0
opt_neighbor = 0
for neighbor in range(1, max_number_of_neighbors):
    model = KNeighborsRegressor(n_neighbors=neighbor)
    model.fit(X_train, y_train)
    predicted_values = model.predict(X_test)
    mean_sqr_error = mean_squared_error(y_test, predicted_values)
    r2_score_calc = r2_score(y_test, predicted_values)
    if max_r2_score < r2_score_calc:  # track the best R^2 seen so far
        min_mean_sqr_error = mean_sqr_error
        max_r2_score = r2_score_calc
        opt_neighbor = neighbor
    print(f"n_neighbors={neighbor}: "
          f"MAE={metrics.mean_absolute_error(y_test, predicted_values):.3f}, "
          f"MSE={mean_sqr_error:.3f}, RMSE={np.sqrt(mean_sqr_error):.3f}, "
          f"R^2={r2_score_calc:.3f}")

print(f"Best n_neighbors: {opt_neighbor} (MSE={min_mean_sqr_error:.3f}, R^2={max_r2_score:.3f})")
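# The manual loop above can also be expressed with scikit-learn's grid
# search, which cross-validates each candidate n_neighbors instead of
# scoring a single train/test split. A minimal sketch, assuming the same
# X_train / y_train as above:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

param_grid = {"n_neighbors": list(range(1, 50))}
search = GridSearchCV(KNeighborsRegressor(), param_grid,
                      cv=5, scoring="neg_mean_squared_error")
search.fit(X_train, y_train)
print(search.best_params_)  # best neighbor count found by CV
print(-search.best_score_)  # its cross-validated MSE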