Example 1
import os

import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler


def main():
    # Load the red- and white-wine quality datasets (semicolon-separated CSVs).
    r_wine_path = os.path.join("data", "winequality-red.csv")
    w_wine_path = os.path.join("data", "winequality-white.csv")
    df_wine_red = pd.read_csv(r_wine_path, sep=';')
    df_wine_white = pd.read_csv(w_wine_path, sep=';')

    # The first 11 columns are features; column 12 is the quality label.
    X_r, y_r = df_wine_red.iloc[:, :11], df_wine_red.iloc[:, 11]
    X_w, y_w = df_wine_white.iloc[:, :11], df_wine_white.iloc[:, 11]

    # Standardize each dataset with its own scaler.
    stdsc_r = StandardScaler()
    stdsc_w = StandardScaler()
    X_r_train_std = stdsc_r.fit_transform(X_r)
    X_w_train_std = stdsc_w.fit_transform(X_w)

    knn_r = KNeighborsClassifier(n_neighbors=5)
    knn_w = KNeighborsClassifier(n_neighbors=5)

    # Run sequential backward selection down to a single feature.
    sbs_r = SBS(knn_r, k_features=1)
    sbs_w = SBS(knn_w, k_features=1)
    sbs_r.fit(X_r_train_std, y_r)
    sbs_w.fit(X_w_train_std, y_w)

    # plot_accuracy is a project helper not shown on this page.
    plot_accuracy(sbs_r.subsets_, sbs_r.scores_)
    plot_accuracy(sbs_w.subsets_, sbs_w.scores_)

    # Both datasets share the same column names, so the white-wine labels
    # work for the red-wine data as well.
    feat_labels = df_wine_white.columns[:11]

    # RandomForest.f_importance is another project helper (random-forest
    # feature importances), also not shown here.
    RandomForest.f_importance(feat_labels, X_r, y_r)
    RandomForest.f_importance(feat_labels, X_w, y_w)
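Every example on this page imports an SBS class that is never defined in the snippets themselves. The sketch below is a minimal reconstruction, assuming the examples follow Sebastian Raschka's "Python Machine Learning" (whose API they clearly mirror: `k_features`, `subsets_`, `scores_`, a `scoring` parameter); the `plot_accuracy` helper used in Example 1 is likewise a hypothetical reconstruction, not the authors' actual code.

from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
from sklearn.base import clone
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


class SBS:
    """Sequential backward selection: greedily drop the feature whose
    removal hurts the validation score the least."""

    def __init__(self, estimator, k_features, scoring=accuracy_score,
                 test_size=0.25, random_state=1):
        self.scoring = scoring
        self.estimator = clone(estimator)
        self.k_features = k_features
        self.test_size = test_size
        self.random_state = random_state

    def fit(self, X, y):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state)
        dim = X_train.shape[1]
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        self.scores_ = [self._calc_score(X_train, y_train,
                                         X_test, y_test, self.indices_)]
        while dim > self.k_features:
            scores, subsets = [], []
            # Score every subset that removes exactly one remaining feature.
            for p in combinations(self.indices_, r=dim - 1):
                scores.append(self._calc_score(X_train, y_train,
                                               X_test, y_test, p))
                subsets.append(p)
            best = np.argmax(scores)
            self.indices_ = subsets[best]
            self.subsets_.append(self.indices_)
            dim -= 1
            self.scores_.append(scores[best])
        return self

    def transform(self, X):
        return X[:, self.indices_]

    def _calc_score(self, X_train, y_train, X_test, y_test, indices):
        self.estimator.fit(X_train[:, indices], y_train)
        return self.scoring(y_test, self.estimator.predict(X_test[:, indices]))


def plot_accuracy(subsets, scores):
    # Hypothetical version of the plot_accuracy helper used in Example 1.
    plt.plot([len(s) for s in subsets], scores, marker='o')
    plt.xlabel('Number of features')
    plt.ylabel('Accuracy')
    plt.grid()
    plt.show()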
Example 2

########################################### Split the dataset ###########################################
X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

############################## Feature scaling (using standardization) ##################################
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)  # transform only: reuse the training-set mean and variance

############################## Implement Sequential Backward Selection using KNN #########################
knn = KNeighborsClassifier(n_neighbors=2)
sbs = SBS(knn, k_features=1)
sbs.fit(X_train_std, y_train)

############################ plot the classification accuracy of the KNN classifier #####################
k_feat = [len(k) for k in sbs.subsets_]
plt.plot(k_feat, sbs.scores_, marker='o')
plt.ylim([0.7, 1.1])
plt.ylabel('Accuracy')
plt.xlabel('Number of features')
plt.grid()
# plt.savefig('sbm_classification_using_knn.png', dpi=300)  # save before plt.show() if a file is wanted
plt.show()

############################ get the features that yield the best performance ################################
k5 = list(sbs.subsets_[8])  # with 13 starting features, subsets_[8] is the 5-feature subset
print(df_wine.columns[1:][k5])
# Output (truncated): [..., 'OD280/OD315 of diluted wines', 'Proline']
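# --- Added sketch (not part of the original example): refit the KNN to
# --- compare the 5 selected features against all 13 on the held-out test set.
knn.fit(X_train_std, y_train)
print('Test accuracy (all 13 features):', knn.score(X_test_std, y_test))
knn.fit(X_train_std[:, k5], y_train)
print('Test accuracy (5 selected features):', knn.score(X_test_std[:, k5], y_test))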

Example 4
# The excerpt below assumes X, y, df (a dataset object exposing `feature_names`),
# n_of_features, n_of_selected_features and the SBS class were defined earlier.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(X)
X_std = sc.transform(X)

n_of_trials = 30  # number of trials
score_train_all = np.zeros(n_of_features)  # per-subset scores on the training data, summed over trials
score_test_all = np.zeros(n_of_features)   # per-subset scores on the test data, summed over trials

#==========================================================
# Instead of cross-validation, this program averages the results of
# several trials run with different random states.
for k in range(0, n_of_trials):
	X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size = 0.3, random_state = k)

	lr = LinearRegression()
	sbs = SBS(lr, k_features=1, scoring=r2_score)
	sbs.fit(X_train, y_train)
	selected_features = list(sbs.subsets_[n_of_features - n_of_selected_features])
	print("Trial {:2d}; Best {} features: {}".format(k+1, n_of_selected_features, df.feature_names[selected_features]))

	score_train = np.array([])
	score_test = np.array([])

	#======================================================
	# Task: fit a linear regression model to each feature subset produced by
	# the SBS algorithm, compute the coefficient of determination (R^2) on the
	# training and test data, and store the results in score_train / score_test.
	# Hint: the feature subsets are stored in sbs.subsets_.

	# One possible solution (sketch):
	for subset in sbs.subsets_:
		indices = list(subset)
		lr.fit(X_train[:, indices], y_train)
		score_train = np.append(score_train, r2_score(y_train, lr.predict(X_train[:, indices])))
		score_test = np.append(score_test, r2_score(y_test, lr.predict(X_test[:, indices])))
	#======================================================

	score_train_all += score_train
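	# --- Added sketch (not in the original excerpt): presumably the test
	# --- scores are accumulated the same way inside the loop.
	score_test_all += score_test

# --- Added sketch: after all trials, average the sums and plot R^2 against
# --- the subset size. Assumes matplotlib is available.
import matplotlib.pyplot as plt

score_train_all /= n_of_trials
score_test_all /= n_of_trials
n_feat = [len(s) for s in sbs.subsets_]
plt.plot(n_feat, score_train_all, marker='o', label='train')
plt.plot(n_feat, score_test_all, marker='s', label='test')
plt.xlabel('Number of features')
plt.ylabel('$R^2$')
plt.legend()
plt.grid()
plt.show()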
Example 5

"""
SBS (sequential backward selection).
"""

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sbs import SBS
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from wine_comon_funcs import wine_initializer

x_train_std, y_train, x_test_std, y_test, __ = wine_initializer()

knn = KNeighborsClassifier(n_neighbors=2)
sbs = SBS(knn, k_features=1)
sbs.fit(x_train_std, y_train)

k_feat = [len(k) for k in sbs.subsets_]
plt.plot(k_feat, sbs.scores_, marker='o')
plt.ylim([0.7, 1.1])
plt.ylabel("Accuracy")
plt.xlabel("Number of features")
plt.grid()
plt.show()

k_5 = list(sbs.subsets_[8])  # the 5-feature subset found by SBS
# Fewer dimensions can give higher test accuracy with less overfitting.
knn.fit(x_train_std[:, k_5], y_train)
# print(df_wine.columns[1:][k_5])
print('Training accuracy:', knn.score(x_train_std[:, k_5], y_train))
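# --- Added sketch (not in the original): the comment above refers to test
# --- accuracy, so also score the same 5-feature subset on the test split.
print('Test accuracy:', knn.score(x_test_std[:, k_5], y_test))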