Example #1
def main():
    # load the red- and white-wine quality datasets (semicolon-separated CSV files)
    r_wine_path = os.path.join("data", "winequality-red.csv")
    w_wine_path = os.path.join("data", "winequality-white.csv")
    df_wine_red = pd.read_csv(r_wine_path, sep=';')
    df_wine_white = pd.read_csv(w_wine_path, sep=';')

    # the first 11 columns are the features, the last column ("quality") is the target
    X_r, y_r = df_wine_red.iloc[:, :11], df_wine_red.iloc[:, 11]
    X_w, y_w = df_wine_white.iloc[:, :11], df_wine_white.iloc[:, 11]

    # standardize each dataset with its own scaler
    stdsc_r = StandardScaler()
    stdsc_w = StandardScaler()
    X_r_train_std = stdsc_r.fit_transform(X_r)
    X_w_train_std = stdsc_w.fit_transform(X_w)

    # run sequential backward selection with a KNN classifier on each dataset
    knn_r = KNeighborsClassifier(n_neighbors=5)
    knn_w = KNeighborsClassifier(n_neighbors=5)

    sbs_r = SBS(knn_r, k_features=1)
    sbs_w = SBS(knn_w, k_features=1)
    sbs_r.fit(X_r_train_std, y_r)
    sbs_w.fit(X_w_train_std, y_w)

    # plot accuracy against the number of selected features
    plot_accuracy(sbs_r.subsets_, sbs_r.scores_)
    plot_accuracy(sbs_w.subsets_, sbs_w.scores_)

    # both datasets share the same 11 feature names
    feat_labels = df_wine_white.columns[:11]

    # compare with random-forest feature importances
    RandomForest.f_importance(feat_labels, X_r, y_r)
    RandomForest.f_importance(feat_labels, X_w, y_w)
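
All of the examples on this page call an SBS class whose definition is not shown. A minimal sketch of such a sequential backward selection class, modeled on the one from Raschka's Python Machine Learning (the exact implementation behind each example may differ), is given below:

from itertools import combinations
import numpy as np
from sklearn.base import clone
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


class SBS:
    """Sequential backward selection: starting from all features, repeatedly drop
    the feature whose removal hurts the validation score the least, down to k_features."""

    def __init__(self, estimator, k_features, scoring=accuracy_score,
                 test_size=0.25, random_state=1):
        self.scoring = scoring
        self.estimator = clone(estimator)
        self.k_features = k_features
        self.test_size = test_size
        self.random_state = random_state

    def fit(self, X, y):
        # hold out an internal validation set for scoring the feature subsets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state)

        dim = X_train.shape[1]
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        self.scores_ = [self._calc_score(X_train, y_train, X_test, y_test, self.indices_)]

        while dim > self.k_features:
            scores, subsets = [], []
            # try every subset obtained by removing exactly one feature
            for p in combinations(self.indices_, r=dim - 1):
                scores.append(self._calc_score(X_train, y_train, X_test, y_test, p))
                subsets.append(p)
            best = np.argmax(scores)
            self.indices_ = subsets[best]
            self.subsets_.append(self.indices_)
            self.scores_.append(scores[best])
            dim -= 1

        self.k_score_ = self.scores_[-1]
        return self

    def transform(self, X):
        return X[:, self.indices_]

    def _calc_score(self, X_train, y_train, X_test, y_test, indices):
        self.estimator.fit(X_train[:, indices], y_train)
        y_pred = self.estimator.predict(X_test[:, indices])
        return self.scoring(y_test, y_pred)

Each entry of subsets_ is a tuple of column indices and scores_[i] is the validation score obtained with subsets_[i], which is what the plotting and subset-lookup code in the examples below relies on.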
Example #2
def main():
    # prepare sample data and target variable
    wine_data = WineData()
    X = wine_data.X
    y = wine_data.y

    # split sample data into training data and test data and standardize them
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)
    sc = StandardScaler().fit(X_train)
    X_train_std = sc.transform(X_train)                                                                         
    X_test_std = sc.transform(X_test)

    # fit estimators
    estimators = [
        LogisticRegression(C=100.0, random_state=1, solver='liblinear', multi_class='ovr'),
        SVC(C=1.0, kernel='linear', random_state=1),
        KNeighborsClassifier(n_neighbors=5)
    ]
    sbs_estimators = [SBS(estimator=estimator, k_features=1).fit(X_train_std, y_train) for estimator in estimators]


    # plot the score at each step
    labels = ['logistic regression', 'SVM', 'KNN']
    for sbs, label in zip(sbs_estimators, labels):
        k_features = list(len(subset) for subset in sbs.subsets_)
        plt.plot(k_features, sbs.scores_, marker='o', label=label)
    plt.xlabel('number of features')
    plt.ylabel('accuracy')
    plt.grid()
    plt.legend()
    plt.show()

    # show results of SBS
    print('[score summary]')
    for sbs, estimator, label in zip(sbs_estimators, estimators, labels):
        print('estimator:', label)

        # find the smallest feature subset that achieves a perfect (1.0) validation score; fall back to all features if none does
        indices = sbs.subsets_[0]
        for i in reversed(range(X.shape[1])):
            if sbs.scores_[i] == 1.0:
                indices = sbs.subsets_[i]
                break
        print('minimal subsets:', [wine_data.features[i] for i in indices])

        # compare score with all the features and one with minimal subsets
        estimator_all = estimator.fit(X_train_std, y_train)
        score_all = estimator_all.score(X_test_std, y_test)
        estimator_min = estimator.fit(X_train_std[:, indices], y_train)
        score_min = estimator_min.score(X_test_std[:, indices], y_test)
        print('score (all features)    :', score_all)
        print('score (minimal features):', score_min)
Example #3
########################################### Split the dataset ###########################################
# df_wine is assumed to already hold the UCI Wine dataset (class label in the first column; see Example #4 for the loading code)
X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

############################## Feature scaling (using standardization) ##################################
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
# the test data must be transformed with the scaler fitted on the training data, not re-fitted
X_test_std = stdsc.transform(X_test)

############################## Implement Sequential Backward Selection using KNN #########################
knn = KNeighborsClassifier(n_neighbors=2)
sbs = SBS(knn, k_features=1)
sbs.fit(X_train_std, y_train)

############################ plot the classification accuracy of the KNN classifier #####################
k_feat = [len(k) for k in sbs.subsets_]
plt.plot(k_feat, sbs.scores_, marker='o')
plt.ylim([0.7, 1.1])
plt.ylabel('Accuracy')
plt.xlabel('Number of features')
plt.grid()
plt.show()
# plt.savefig('sbm_classification_using_knn.png', dpi=300)

############################ get the features that yield best performance ################################
# with 13 features, subsets_[8] is the subset left after 8 elimination steps, i.e. a 5-feature subset
k5 = list(sbs.subsets_[8])
print(df_wine.columns[1:][k5])
Example #4
# load the Wine dataset from the UCI Machine Learning Repository
df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/'
                      'machine-learning-databases/wine/wine.data',
                      header=None)
df_wine.columns = [
    'Class label', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash',
    'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols',
    'Proanthocyanins', 'Color intensity', 'Hue',
    'OD280/OD315 of diluted wines', 'Proline'
]

X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)

knn = KNeighborsClassifier(n_neighbors=2)
sbs = SBS(knn, 1)
sbs.fit(X_train_std, y_train)

k_feat = [len(k) for k in sbs.subsets_]
plt.plot(k_feat, sbs.scores_, marker='o')
plt.ylim([0.7, 1.1])
plt.ylabel('Accuracy')
plt.xlabel('Number of features')
plt.grid()
plt.show()
Example #6
# Z-score standardization
sc = StandardScaler()
sc.fit(X)
X_std = sc.transform(X)

n_of_trials = 30 # number of trials
score_train_all = np.zeros(n_of_features) # stores the training-data score for each feature subset
score_test_all = np.zeros(n_of_features)  # stores the test-data score for each feature subset

#==========================================================
# Instead of cross-validation, this program averages the results of multiple trials run with different random states.
for k in range(0, n_of_trials):
	X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size = 0.3, random_state = k)

	lr = LinearRegression()
	sbs = SBS(lr, k_features=1, scoring=r2_score)
	sbs.fit(X_train, y_train)
	selected_features = list(sbs.subsets_[n_of_features - n_of_selected_features])
	print("Trial {:2d}; Best {} features: {}".format(k+1, n_of_selected_features, df.feature_names[selected_features]))

	score_train = np.array([])
	score_test = np.array([])

	#======================================================
	# Exercise: for each feature subset obtained by the SBS algorithm, fit a linear regression
	# model, compute the coefficient of determination (R^2) on the training and test data,
	# and store the results in score_train and score_test.
	# Hint: the feature subsets are stored in sbs.subsets_.
	
	[YOUR CODE HERE]
	#======================================================
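
One possible way to fill in the [YOUR CODE HERE] block is sketched below, indented to drop in at the placeholder inside the trial loop. It is only an illustrative sketch, assuming (as in the surrounding code) that sbs.subsets_ holds tuples of column indices and that score_train / score_test should end up with one R^2 value per subset; the intended reference solution may differ.

	# sketch: for every subset found by SBS (all features down to one), refit a
	# linear regression on just those columns and record R^2 on train and test data
	for subset in sbs.subsets_:
		cols = list(subset)
		lr.fit(X_train[:, cols], y_train)
		score_train = np.append(score_train, r2_score(y_train, lr.predict(X_train[:, cols])))
		score_test = np.append(score_test, r2_score(y_test, lr.predict(X_test[:, cols])))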