def run_knn(df_knn):
    """Choose ``k`` for a k-nearest-neighbours classifier by cross-validation.

    Every odd ``k`` from 1 to 49 is scored with 10-fold cross-validated
    accuracy on the training split returned by ``split_train_test``. The
    ``k`` with the lowest misclassification error (``1 - mean accuracy``) is
    printed, and the candidate list plus the error list are passed to
    ``plot_data``.

    Args:
        df_knn: pandas DataFrame containing the feature columns and a
            ``"class"`` column holding the target labels.
    """
    print(
        "\n\n----------------------K Nearest Neighbors----------------------\n\n"
    )

    # Keep the target out of the feature matrix. The previous
    # ``df_knn.ix[:, 0:]`` selected every column (including "class"), which
    # leaks the label into the features, and ``.ix`` was removed in pandas 1.0.
    y = np.array(df_knn["class"])
    x = np.array(df_knn.drop(columns="class"))
    x_train, _x_test, y_train, _y_test = split_train_test(x, y)

    # Odd values only: 1, 3, ..., 49.
    neighbors = list(range(1, 50, 2))

    cv_scores = []
    for k in neighbors:
        knn = KNeighborsClassifier(n_neighbors=k)
        scores = cross_val_score(knn, x_train, y_train, cv=10, scoring="accuracy")
        cv_scores.append(scores.mean())

    # Misclassification error per candidate k (not a mean squared error).
    errors = [1 - score for score in cv_scores]
    optimal_k = neighbors[errors.index(min(errors))]
    print("The optimal number of neighbors is %d" % optimal_k)

    plot_data(neighbors, errors)
def run_random_forest(df_knn):
    """Fit a random forest on the training split and print its test accuracy.

    Args:
        df_knn: pandas DataFrame containing the feature columns and a
            ``"class"`` column holding the target labels.
    """
    print("\n\n----------------------Random Forest----------------------\n\n")

    # Keep the target out of the feature matrix. The previous
    # ``df_knn.ix[:, 0:]`` selected every column (including "class"), which
    # leaks the label into the features, and ``.ix`` was removed in pandas 1.0.
    y = np.array(df_knn["class"])
    x = np.array(df_knn.drop(columns="class"))
    x_train, x_test, y_train, y_test = split_train_test(x, y)

    rf = RandomForestClassifier(
        random_state=1,
        n_estimators=250,
        min_samples_split=8,
        min_samples_leaf=4,
    )
    rf.fit(x_train, y_train)

    pred = rf.predict(x_test)
    print("Accuracy: ", accuracy_score(y_test, pred))
def _print_rf_class_accuracy(model, features, labels, class_label, banner):
    """Print ``banner`` and ``model``'s accuracy on the rows labelled ``class_label``."""
    rows = np.where(labels == class_label)[0]
    subset_pred = model.predict(features[rows])
    print(banner, accuracy_score(labels[rows], subset_pred))


def run_random_forest(df_knn):
    """Fit a random forest and print overall and per-class accuracies.

    The model is fitted on the training split returned by
    ``split_train_test``. Per-class accuracies are computed on the
    *training* rows whose label equals the given class value; overall
    accuracy is printed for both the training and the testing split.

    Args:
        df_knn: pandas DataFrame containing the feature columns and a
            ``"class"`` column holding the target labels.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    print("\n\n----------------------Random Forest----------------------\n\n")

    y = np.array(df_knn["class"])
    x = np.array(df_knn.drop(columns="class"))
    x_train, x_test, y_train, y_test = split_train_test(x, y)

    rf = RandomForestClassifier(
        random_state=1,
        n_estimators=250,
        min_samples_split=8,
        min_samples_leaf=4,
    )
    rf.fit(x_train, y_train)

    test_pred = rf.predict(x_test)
    train_pred = rf.predict(x_train)

    # Positions (within the training split) of the rows labelled 2.
    print("location index:", np.where(y_train == 2)[0])

    # (banner, class label) pairs, reported in this order.
    # NOTE(review): "Pred good" filters on label 1, exactly like
    # "Pred Failure", so both lines always report the same number. This looks
    # like a copy-paste slip; confirm which label "good" should use.
    class_reports = (
        ("\n\n----------------------Pred success----------------------\n\n", 2),
        ("\n\n----------------------Pred Failure----------------------\n\n", 1),
        ("\n\n----------------------Pred good----------------------\n\n", 1),
    )
    for banner, class_label in class_reports:
        _print_rf_class_accuracy(rf, x_train, y_train, class_label, banner)

    print(
        "\n\n----------------------Training Set Accuracy----------------------\n\n",
        accuracy_score(y_train, train_pred),
    )
    print(
        "\n\n----------------------Testing Set Accuracy----------------------\n\n",
        accuracy_score(y_test, test_pred),
    )
    return rf
def _print_logistic_class_accuracy(model, features, labels, class_label, banner):
    """Print ``banner`` and ``model.score`` on the rows labelled ``class_label``."""
    rows = np.where(labels == class_label)[0]
    print(banner, model.score(features[rows], labels[rows]))


def run_logistic_regression(df_knn):
    """Fit a logistic regression and print overall and per-class scores.

    The model is fitted on the training split returned by
    ``split_train_test``. Overall training and testing scores are printed
    multiplied by 100; the per-class scores are computed on the *training*
    rows whose label equals the given class value and printed unscaled.

    Args:
        df_knn: pandas DataFrame containing the feature columns and a
            ``"class"`` column holding the target labels.

    Returns:
        The fitted ``LogisticRegression`` model.
    """
    y = np.array(df_knn["class"])
    x = np.array(df_knn.drop(columns="class"))
    x_train, x_test, y_train, y_test = split_train_test(x, y)

    logistic = LogisticRegression()
    logistic.fit(x_train, y_train)

    # ``score`` predicts internally, so the separate ``predict`` calls whose
    # results were never read have been dropped.
    print("Training accuracy: ", (logistic.score(x_train, y_train) * 100))
    print("Testing accuracy: ", (logistic.score(x_test, y_test) * 100))

    # (banner, class label) pairs, reported in this order.
    # NOTE(review): "Pred good" filters on label 1, exactly like
    # "Pred Failure", so both lines always report the same number. This looks
    # like a copy-paste slip; confirm which label "good" should use.
    class_reports = (
        ("\n\n----------------------Pred success----------------------\n\n", 2),
        ("\n\n----------------------Pred Failure----------------------\n\n", 1),
        ("\n\n----------------------Pred good----------------------\n\n", 1),
    )
    for banner, class_label in class_reports:
        _print_logistic_class_accuracy(logistic, x_train, y_train, class_label, banner)

    return logistic
def run_xgboost_imdb(df_knn):
    """Fit an XGBoost classifier and print training and testing accuracy.

    All columns of ``df_knn`` are converted to an array and split with
    ``split_train_test``; a fixed set of column positions is then removed
    from both splits before the model is fitted.

    Args:
        df_knn: pandas DataFrame containing the feature columns and a
            ``"class"`` column holding the target labels.
    """
    print(
        "\n\n----------------------XGBoost on IMDB dataset----------------------\n\n"
    )

    # ``.ix`` was removed in pandas 1.0; ``.iloc[:, 0:]`` selects the same
    # full set of columns, so the positional indices used below keep their
    # meaning.
    x = np.array(df_knn.iloc[:, 0:])
    y = np.array(df_knn["class"])
    x_train, x_test, y_train, y_test = split_train_test(x, y)

    # Column positions excluded from the model input.
    # NOTE(review): x still contains the "class" column at this point;
    # confirm that one of these positions is the label, otherwise the target
    # leaks into the features.
    dropped_columns = [0, 1, 2, 3, 4, 5, 9, 44]
    x_train = np.delete(x_train, dropped_columns, axis=1)
    x_test = np.delete(x_test, dropped_columns, axis=1)

    model = xgboost.XGBClassifier()
    model.fit(x_train, y_train)

    train_accuracy = accuracy_score(y_train, model.predict(x_train))
    print("Training accuracy: %.2f%%" % (train_accuracy * 100.0))

    test_accuracy = accuracy_score(y_test, model.predict(x_test))
    print("Testing accuracy: %.2f%%" % (test_accuracy * 100.0))
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Print the installed NumPy version, then call split_train_test on 'file1.txt'.
import numpy

from split_dataset import split_train_test

# The function-call form works on Python 3, and on Python 2 for a single
# argument; the bare ``print x`` statement is a SyntaxError on Python 3.
print(numpy.version.version)

split_train_test('file1.txt')