def get_best_knn_prediction(x_train, x_test, y_train, y_test):
    """Sweep k = 1..9 for KNN, plot train/test accuracy vs. k, and return
    the best run.

    Args:
        x_train, x_test: feature matrices for the train/test splits.
        y_train, y_test: corresponding label vectors.

    Returns:
        (best_predictions, max_test_score, best_k): the predictions, test
        score, and k of the best-scoring sweep iteration.
    """
    best_k = 0
    best_predictions = []
    # Start at -inf so the first k always becomes the incumbent.  The
    # original started at 0 with a strict '>', which returned best_k=0 and
    # empty predictions if every test score happened to be 0.
    max_test_score = float('-inf')
    ks = range(1, 10)
    test_scores = []
    train_scores = []
    for k in ks:
        # models.KNN is assumed to return (predictions, test_score,
        # train_score) -- TODO confirm against the models module.
        results_knn, test_score, train_score = models.KNN(
            x_train, y_train, x_test, y_test, k)
        test_scores.append(test_score)
        train_scores.append(train_score)
        if test_score > max_test_score:
            best_k = k
            best_predictions = results_knn
            max_test_score = test_score
    print("Best K: ", best_k)
    plt.plot(ks, test_scores, label='Test Accuracy')
    plt.plot(ks, train_scores, label='Train Accuracy')
    plt.legend()
    plt.xlabel("K")
    plt.ylabel("Accuracy (%)")
    # The original title was an empty placeholder string.
    plt.title("KNN Accuracy vs. K")
    plt.show()
    return best_predictions, max_test_score, best_k
def main():
    """Load the data splits, optionally reduce dimensionality, then fit a
    KNN model and write its test predictions to ``args.predictions_file``.
    """
    args = get_args()
    X, y = load(args.data, 'train')
    test_X, _ = load(args.data, args.test_split)

    if args.dr_algorithm is not None:
        # Reduce train and test together so both end up in the same
        # projected space, then split them back apart afterwards.
        num_train = X.shape[0]
        combined = np.concatenate((X, test_X))

        start = time.time()
        if args.dr_algorithm == 'pca':
            reduced = models.PCA(combined, args.target_dim).fit(combined)
        elif args.dr_algorithm == 'lle':
            reduced = models.LLE(combined, args.target_dim,
                                 args.lle_k).fit(combined)
        else:
            raise Exception('Invalid dimensionality reduction algorithm')
        end = time.time()
        print(f"dimensionality reduction took {end - start} seconds!")

        X, test_X = reduced[:num_train], reduced[num_train:]

    model = models.KNN(args.knn_k)
    model.fit(X, y)
    y_hat = model.predict(test_X)
    # Labels are integers, hence the %d format.
    np.savetxt(args.predictions_file, y_hat, fmt='%d')
def models_compare(x, y):
    """Train several regression models on a 70/30 split of (x, y), overlay a
    regplot of predicted vs. actual ratings for each, and print evaluation
    metrics via ``Evaluationmatrix``.

    Args:
        x: feature matrix.
        y: target ratings.
    """
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.30)

    # (model function, plot color, label) -- the original repeated this
    # fit/plot/evaluate sequence six times verbatim; order is preserved.
    model_specs = [
        (models.SVM, 'red', 'SVM'),
        (models.TREE, 'green', 'TREE'),
        (models.RIDGE, 'orange', 'RIDGE'),
        (models.KNN, 'yellow', 'KNN'),
        (models.LR, 'blue', 'LR'),
        (models.RFR, 'black', 'RFR'),
    ]
    for model_fn, color, label in model_specs:
        predictions = model_fn(X_train, y_train, X_test)
        sns.regplot(predictions, y_test, color=color, label=label)
        Evaluationmatrix(y_test, predictions, label)

    plt.title('Models Comparison')
    plt.xlabel('Predicted Ratings')
    plt.ylabel('Actual Ratings')
    plt.legend()
    plt.show()
for train_index, test_index in group_kfold.split(Xdata, Ydata, groups): model.train(Xdata[train_index], Ydata[train_index]) Ypred = model.test(Xdata[test_index]) confusion = sklearn.metrics.confusion_matrix(Ydata[test_index], Ypred, labels=features.labels) if sum_confusion is None: sum_confusion = np.zeros(confusion.shape) sum_confusion += confusion return sum_confusion / k def select_best_model(Xdata, Ydata, models): avg_accuracies = [(i, k_fold_cross_validate(Xdata, Ydata, 4, model)) for i, model in enumerate(models)] print(avg_accuracies) return max(avg_accuracies, key=operator.itemgetter(1)) allfeatures = features.compute_or_read_features() Xdata, Ydata = to_numpy_arrays(allfeatures) models = [models.RandomForest(200, 'gini'), models.LogisticRegression(), models.SVMNonLinear('rbf'), models.SVMNonLinear('sigmoid'), models.NeuralNet(), models.KNN()] #best = select_best_model(Xdata, Ydata, models) #print(best) for model in models: cm = k_fold_confusion_matrix(Xdata, Ydata, 4, model) save_confusion_matrix(cm, model._name) print(f"Confusion matrix for {model._name} saved")
# Fit on one split and evaluate on the held-out split, timing the run.
model.clf.fit(df_test, df_test_target)
df_test_new_pred = model.clf.predict(df_test_new)
print("--- %s seconds ---" % (time.time() - start_time))

# Confusion Matrix
models.plot_confusion_matrix(df_test_new_target, df_test_new_pred, names)
# models.plot_loss_curve(model.clf)
print(
    metrics.classification_report(df_test_new_target, df_test_new_pred,
                                  target_names=names))

if model_name.upper() == "K":
    model = models.KNN()
    # df_test = df_test[:2000]
    # df_test_target = df_test_target[:2000]
    # df_test_new = df_test_new[:2000]
    # NOTE(review): the commented line below slices df_test_new, not
    # df_test_new_target -- likely a typo if ever re-enabled.
    # df_test_new_target = df_test_new[:2000]
    # BUG FIX: the grid previously started at np.arange(0, ...), but KNN
    # requires n_neighbors >= 1; this also matches the validation-curve
    # range used just below.
    param_grid = {'n_neighbors': np.arange(1, 40, 5)}
    # Plot validation curves for different parameters
    models.plot_validation_curve(model.clf, df_test, df_test_target,
                                 'n_neighbors', np.arange(1, 40, 5), 5)
    models.plot_validation_curve(model.clf, df_test, df_test_target,
                                 'weights', ['uniform', 'distance'], 5)
    # start_time = time.time()
    # gs_model = models.conduct_grid_search(model.clf, df_test, df_test_target, param_grid)
# Build the per-run output directory (<output_path>/<note>/) and the output
# file path inside it.
output_path = args.output_path + args.note + '/'
output_file = args.output_path + args.note + '/' + args.output_file
test_data = TestFile(test_file, train_data, output_path, output_file)

# Select the torch optimizer *class* from the CLI flag; it is not
# instantiated here (presumably instantiated later by the model -- verify).
if args.optim == 'sgd':
    optim = torch.optim.SGD
elif args.optim == 'adam':
    optim = torch.optim.Adam
elif args.optim == 'adagrad':
    optim = torch.optim.Adagrad
else:
    raise NotImplementedError

print('training')
# Dispatch on the requested model name; every branch constructs the model
# with test_data.write as a callback and then trains on train_data.
# NOTE(review): the 'lr' flag maps to RidgeRegression, not logistic/linear
# regression -- confirm this is intentional.
if args.model == 'knn':
    model = models.KNN(args.k, args.f, test_data.write, args.seed, args.num_seed)
    model.train_model(train_data, args.no_valid)
elif args.model == 'lr':
    model = models.RidgeRegression(args.lamb, args.f, test_data.write, args.seed, args.num_seed)
    model.train_model(train_data, args.no_valid)
elif args.model == 'nb':
    model = models.NaiveBayes(args.model_type, args.f, test_data.write, args.seed, args.num_seed)
    model.train_model(train_data, args.no_valid)
elif args.model == 'svm':
    model = models.SVM(args.kernel, args.c, args.f, test_data.write, args.seed, args.num_seed)
    model.train_model(train_data, args.no_valid)
elif args.model == 'mlp':
    # NOTE(review): this call continues past the end of the visible chunk.
    model = models.MLP(