import os

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
# SBS, plot_accuracy, and RandomForest are project-local helpers (see the sketches below).


def main():
    # Load the red- and white-wine datasets (semicolon-separated CSVs).
    r_wine_path = os.path.join("data", "winequality-red.csv")
    w_wine_path = os.path.join("data", "winequality-white.csv")
    df_wine_red = pd.read_csv(r_wine_path, sep=';')
    df_wine_white = pd.read_csv(w_wine_path, sep=';')

    # The first 11 columns are the features; the 12th is the quality label.
    X_r, y_r = df_wine_red.iloc[:, :11], df_wine_red.iloc[:, 11]
    X_w, y_w = df_wine_white.iloc[:, :11], df_wine_white.iloc[:, 11]

    # Standardize each dataset with its own scaler.
    stdsc_r = StandardScaler()
    stdsc_w = StandardScaler()
    X_r_train_std = stdsc_r.fit_transform(X_r)
    X_w_train_std = stdsc_w.fit_transform(X_w)

    # Run sequential backward selection with a 5-NN classifier on each dataset.
    knn_r = KNeighborsClassifier(n_neighbors=5)
    knn_w = KNeighborsClassifier(n_neighbors=5)
    sbs_r = SBS(knn_r, k_features=1)
    sbs_w = SBS(knn_w, k_features=1)
    sbs_r.fit(X_r_train_std, y_r)
    sbs_w.fit(X_w_train_std, y_w)

    # Plot accuracy versus subset size for both datasets.
    plot_accuracy(sbs_r.subsets_, sbs_r.scores_)
    plot_accuracy(sbs_w.subsets_, sbs_w.scores_)

    # Red and white wines share the same feature names.
    feat_labels = df_wine_white.columns[:11]
    RandomForest.f_importance(feat_labels, X_r, y_r)
    RandomForest.f_importance(feat_labels, X_w, y_w)
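# ----------------------------------------------------------------------------
# plot_accuracy and RandomForest.f_importance are project-local helpers that
# are not part of this excerpt, so their exact behavior is an assumption.
# The sketches below are inferred only from how main() calls them.
# ----------------------------------------------------------------------------
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier


def plot_accuracy(subsets, scores):
    """Plot validation accuracy against the number of selected features."""
    k_feat = [len(k) for k in subsets]
    plt.plot(k_feat, scores, marker='o')
    plt.ylabel('Accuracy')
    plt.xlabel('Number of features')
    plt.grid()
    plt.show()


def f_importance(feat_labels, X, y):
    """Fit a random forest and print features ranked by impurity-based importance.

    Assumed to live in a RandomForest module, hence RandomForest.f_importance.
    """
    forest = RandomForestClassifier(n_estimators=500, random_state=1)
    forest.fit(X, y)
    importances = forest.feature_importances_
    for rank, idx in enumerate(importances.argsort()[::-1], start=1):
        print("{:2d}) {:<30} {:.4f}".format(rank, feat_labels[idx], importances[idx]))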
########################################### Split the dataset ###########################################
X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

############################## Feature scaling (using standardization) ##################################
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)  # reuse the training statistics; refitting on X_test would leak test information

############################# Implement Sequential Backward Selection using KNN #########################
knn = KNeighborsClassifier(n_neighbors=2)
sbs = SBS(knn, k_features=1)
sbs.fit(X_train_std, y_train)

############################ Plot the classification accuracy of the KNN classifier #####################
k_feat = [len(k) for k in sbs.subsets_]
plt.plot(k_feat, sbs.scores_, marker='o')
plt.ylim([0.7, 1.1])
plt.ylabel('Accuracy')
plt.xlabel('Number of features')
plt.grid()
plt.show()
# plt.savefig('sbm_classification_using_knn.png', dpi=300)

############################ Get the features that yield the best performance ###########################
k5 = list(sbs.subsets_[8])  # with 13 features, subsets_[8] is the best 5-feature subset
print(df_wine.columns[1:][k5])
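# ----------------------------------------------------------------------------
# The SBS class used above is imported from a local module and is not shown in
# this excerpt. The sketch below is a minimal implementation, assumed (not
# confirmed) to match the original; it follows the widely used version from
# Raschka's "Python Machine Learning" and matches the interface used here
# (k_features, scoring, fit, subsets_, scores_).
# ----------------------------------------------------------------------------
from itertools import combinations

import numpy as np
from sklearn.base import clone
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


class SBS:
    """Sequential backward selection: remove one feature at a time, keeping
    the subset that scores best on a held-out validation split."""

    def __init__(self, estimator, k_features, scoring=accuracy_score,
                 test_size=0.25, random_state=1):
        self.scoring = scoring
        self.estimator = clone(estimator)
        self.k_features = k_features
        self.test_size = test_size
        self.random_state = random_state

    def fit(self, X, y):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state)
        dim = X_train.shape[1]
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        self.scores_ = [self._calc_score(X_train, y_train, X_test, y_test, self.indices_)]
        while dim > self.k_features:
            scores, subsets = [], []
            # Evaluate every subset with one feature removed and keep the best.
            for p in combinations(self.indices_, r=dim - 1):
                scores.append(self._calc_score(X_train, y_train, X_test, y_test, p))
                subsets.append(p)
            best = np.argmax(scores)
            self.indices_ = subsets[best]
            self.subsets_.append(self.indices_)
            self.scores_.append(scores[best])
            dim -= 1
        self.k_score_ = self.scores_[-1]
        return self

    def transform(self, X):
        return X[:, self.indices_]

    def _calc_score(self, X_train, y_train, X_test, y_test, indices):
        self.estimator.fit(X_train[:, indices], y_train)
        y_pred = self.estimator.predict(X_test[:, indices])
        return self.scoring(y_test, y_pred)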
sc = StandardScaler()
sc.fit(X)
X_std = sc.transform(X)

n_of_trials = 30  # number of trials
score_train_all = np.zeros(n_of_features)  # accumulates the training-set score for each subset size
score_test_all = np.zeros(n_of_features)   # accumulates the test-set score for each subset size

#==========================================================
# Instead of cross-validation, this program averages the results of
# repeated trials with different random states.
for k in range(0, n_of_trials):
    X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.3, random_state=k)
    lr = LinearRegression()
    sbs = SBS(lr, k_features=1, scoring=r2_score)
    sbs.fit(X_train, y_train)

    selected_features = list(sbs.subsets_[n_of_features - n_of_selected_features])
    print("Trial {:2d}; Best {} features: {}".format(k + 1, n_of_selected_features,
                                                     df.feature_names[selected_features]))

    score_train = np.array([])
    score_test = np.array([])
    #======================================================
    # Exercise: for each feature subset obtained by the SBS algorithm, fit a
    # linear regression model, compute the coefficient of determination (R^2)
    # on the training and test data, and store the results in score_train and
    # score_test. Hint: the feature subsets are stored in sbs.subsets_.
    # One possible solution is sketched below, after this script.
    [YOUR CODE HERE]
    #======================================================
    score_train_all += score_train
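# One possible solution for the [YOUR CODE HERE] block above -- an illustrative
# sketch, not the official answer, left here only for reference. It assumes
# sbs.subsets_ lists the subsets from the full feature set down to a single
# feature, so the resulting arrays line up with score_train_all / score_test_all.
# The body belongs inside the trial loop, at the placeholder:
for subset in sbs.subsets_:
    idx = list(subset)
    lr.fit(X_train[:, idx], y_train)
    score_train = np.append(score_train, r2_score(y_train, lr.predict(X_train[:, idx])))
    score_test = np.append(score_test, r2_score(y_test, lr.predict(X_test[:, idx])))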
"""SBS (sequential backward selection) on the Wine dataset."""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sbs import SBS
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from wine_comon_funcs import wine_initializer

x_train_std, y_train, x_test_std, y_test, __ = wine_initializer()

knn = KNeighborsClassifier(n_neighbors=2)
sbs = SBS(knn, k_features=1)
sbs.fit(x_train_std, y_train)

k_feat = list(len(k) for k in sbs.subsets_)
plt.plot(k_feat, sbs.scores_, marker='o')
plt.ylim([0.7, 1.1])
plt.ylabel("Accuracy")
plt.xlabel("Number of features")
plt.grid()
plt.show()

# With 13 features, subsets_[8] is the best 5-feature subset.
k_5 = list(sbs.subsets_[8])
knn.fit(x_train_std[:, k_5], y_train)
# Fewer dimensions give higher test accuracy with less overfitting.
# print(df_wine.columns[1:][k_5])
print('Training accuracy:', knn.score(x_train_std[:, k_5], y_train))
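# The comment above claims better test accuracy with fewer dimensions, but only
# the training score is printed. The natural follow-up, using the test split
# already returned by wine_initializer, is to score the same 5-feature model
# on the held-out data:
print('Test accuracy:', knn.score(x_test_std[:, k_5], y_test))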