def kaggle(alpha, c, epsilon):
    """Train an L2-regularized logistic-regression model by full-batch
    gradient descent and write Kaggle predictions for the test set to CSV.

    alpha   -- gradient-descent step size
    c       -- regularization strength forwarded to calc_gradient/obj
    epsilon -- stop once the objective value drops below this threshold
    """
    features = temp["X"]
    test_features = temp["X_test"]
    labels = temp["y"]

    # Shuffle the training set, then standardize both splits with the
    # same pre-fitted scaler.
    features, labels = permute_dictionaries(features, labels)
    features = scaler.transform(features)
    test_features = scaler.transform(test_features)

    # Append a bias column of ones to each design matrix.
    features = np.hstack((features, np.ones((features.shape[0], 1))))
    test_features = np.hstack((test_features, np.ones((test_features.shape[0], 1))))

    w = np.zeros((13, 1))  # presumably 12 features + bias — TODO confirm
    epoch = 0
    loss = np.inf
    while loss >= epsilon:
        preds = sigmoid(np.matmul(features, w))
        w = w - alpha * calc_gradient(features, labels, preds, w, c)
        # The objective is evaluated with predictions from the pre-update
        # weights but the post-update w, matching the original code.
        loss = obj(labels, preds, w, c)
        epoch += 1
        if epoch % 1000 == 0:
            print(loss)

    test_predictions = classify(test_features, w)
    print(test_predictions)
    results_to_csv(test_predictions.flatten())
def mnist_hyperparameter():
    """Sweep C over 10^-8 .. 10^-2 on a 10k-sample MNIST subset, select the
    C with the best validation accuracy, retrain on the full training set,
    and write Kaggle test predictions to CSV.

    Returns:
        (C_vals, accuracies): candidate C values and their validation
        accuracies, in sweep order.

    Fixes vs. original: removed the unused `top_ind` local and replaced the
    manual top_acc/top_C scan (initialized to 0, which would leave top_C=0 —
    an invalid C — if every accuracy were 0.0) with an index-max selection.
    Ties still resolve to the first maximum, as before.
    """
    data = split_mnist_data()
    training_data = data["training_data"][:10000]
    training_labels = data["training_labels"][:10000]
    val_true = data["validation_labels"]

    C_vals = []
    accuracies = []
    for exp in range(-8, -1):
        C = 10 ** exp
        C_vals.append(C)
        clf = svm.SVC(kernel='linear', C=C)
        clf.fit(training_data, training_labels)
        val_pred = clf.predict(data["validation_data"])
        accuracies.append(accuracy_score(val_true, val_pred))

    # Index of the highest validation accuracy (first winner on ties).
    best = max(range(len(accuracies)), key=accuracies.__getitem__)
    top_C = C_vals[best]
    #print("Top C:", top_C)

    # Retrain on the full (shuffled) training set with the best C and
    # generate the Kaggle submission.
    mnist_data = scipy.io.loadmat('data/mnist_data.mat')
    test_data = mnist_data["test_data"]
    training_data, training_labels = shuffle(mnist_data["training_data"],
                                             mnist_data["training_labels"])
    clf = svm.LinearSVC(C=top_C)
    clf.fit(training_data, training_labels)
    test_preds = clf.predict(test_data)
    results_to_csv(test_preds)
    return C_vals, accuracies
def problem5(training_data, training_data_labels, test_data, C_value):
    """Fit a linear SVM with the given C, print the training-set accuracy,
    and save test-set predictions to CSV."""
    model = svm.LinearSVC(dual=False, random_state=10, C=C_value)
    flat_labels = np.ravel(training_data_labels)
    model.fit(training_data, flat_labels)

    # Accuracy on the very data the model was trained on.
    train_preds = model.predict(training_data)
    print(accuracy_score(flat_labels, np.ravel(train_preds)))

    results_to_csv(model.predict(test_data))
def problem6(training_data, training_data_labels, test_data, C_Value=0):
    """Train svm.LinearSVC with C = 10**C_Value (the default C_Value=0 gives
    C=1), print the training accuracy, and write test predictions to CSV."""
    labels = np.ravel(training_data_labels)
    model = svm.LinearSVC(random_state=40, C=10 ** C_Value)
    model.fit(training_data, labels)
    # Report how well the model reproduces its own training labels.
    print(accuracy_score(labels, np.ravel(model.predict(training_data))))
    results_to_csv(model.predict(test_data))
def problem6(training_data, training_data_labels, test_data, linear, C_Value=0):
    """Train either a LinearSVC (linear truthy) or an svm.SVC with a linear
    kernel (linear falsy), print training accuracy, and save test
    predictions to CSV.

    NOTE(review): this redefines `problem6` and shadows any earlier
    definition of the same name; `C_Value` is accepted but never used.
    """
    if linear:
        model = svm.LinearSVC(dual=False, random_state=10, verbose=1,
                              max_iter=1000000)
    else:
        model = svm.SVC(kernel="linear", random_state=10, verbose=0)

    labels = np.ravel(training_data_labels)
    model.fit(training_data, labels)
    print(accuracy_score(labels, np.ravel(model.predict(training_data))))
    results_to_csv(model.predict(test_data))
def kaggle(c):
    """Run QDA with regularization parameter c on deskewed MNIST, print the
    training error rate, and save test-set predictions to CSV.

    NOTE(review): this redefines the name `kaggle` also used elsewhere in
    the file — only the last definition survives at import time.
    """
    train_x = deskew_all(mnist_data["training_data"])
    train_y = mnist_data["training_labels"]
    test_x = deskew_all(mnist_data["test_data"])

    # Per-class statistics feeding the QDA model.
    by_class = partition_data(train_x, train_y)
    class_means = empirical_mean(by_class)
    class_covs = empirical_cov(by_class)
    class_priors = calc_priors(by_class, len(train_x))

    preds = QDA(class_means, class_covs, class_priors,
                {'training': train_x, 'test': test_x}, c)
    print(error_rate(np.array([preds['training']]).T, train_y))
    results_to_csv(np.array(preds['test']))
    return
def spam_hyperparameter():
    """5-fold cross-validation over C in 10^-5 .. 10^3 for a linear SVM on
    the spam data, then retrain with the best C on the full training set and
    write Kaggle test predictions to CSV.

    Returns:
        (C_vals, accuracies): candidate C values and their mean
        cross-validation accuracies.

    Bug fix vs. original: the classifier was refit once per training fold
    inside a loop (`for ind ...: clf.fit(...)`), so each fit threw away the
    previous folds and the model was effectively trained on only the last
    fold. The four training folds are now concatenated into one training
    set before a single fit. Also replaced the manual top_acc/top_C scan
    (which left top_C=0 if all accuracies were 0.0) with an index-max.
    """
    C_vals = []
    accuracies = []
    data = split_spam_data_crossval(5)
    for exp in range(-5, 4):
        print(exp)
        C = 10 ** exp
        C_vals.append(C)
        accuracy = 0
        for k in range(5):
            # Fold k is held out for validation; combine the other four
            # into the actual training set.
            training_data = [data[i]["data"] for i in range(5) if i != k]
            training_labels = [data[i]["labels"] for i in range(5) if i != k]
            val_data = data[k]["data"]
            val_true = data[k]["labels"]
            clf = svm.LinearSVC(C=C)
            clf.fit(np.concatenate(training_data),
                    np.concatenate(training_labels))
            val_pred = clf.predict(val_data)
            accuracy += accuracy_score(val_true, val_pred)
        accuracies.append(accuracy / 5)

    # Index of the best mean CV accuracy (first winner on ties).
    best = max(range(len(accuracies)), key=accuracies.__getitem__)
    top_C = C_vals[best]

    # Retrain on the full shuffled training set and save the submission.
    spam_data = scipy.io.loadmat('data/spam_data.mat')
    test_data = spam_data["test_data"]
    training_data, training_labels = shuffle(spam_data["training_data"],
                                             spam_data["training_labels"])
    clf = svm.LinearSVC(C=top_C)
    clf.fit(training_data, training_labels)
    test_preds = clf.predict(test_data)
    results_to_csv(test_preds)
    return C_vals, accuracies
def cifar10_hyperparameter():
    """Sweep C over 10^-2 .. 10^1 on a 2k-sample CIFAR-10 subset, pick the C
    with the best validation accuracy, retrain on the full training set, and
    write Kaggle test predictions to CSV.

    Returns:
        (C_vals, accuracies): candidate C values and their validation
        accuracies, in sweep order.

    Fixes vs. original: removed the unused `top_ind` local and replaced the
    manual top_acc/top_C scan (initialized to 0, leaving top_C=0 — an
    invalid C — if every accuracy were 0.0) with an index-max selection.
    Ties still resolve to the first maximum, as before.
    """
    data = split_cifar10_data()
    training_data = data["training_data"][:2000]
    training_labels = data["training_labels"][:2000]
    val_true = data["validation_labels"]

    C_vals = []
    accuracies = []
    for exp in range(-2, 2):
        print(exp)
        C = 10 ** exp
        C_vals.append(C)
        clf = svm.LinearSVC(C=C)
        clf.fit(training_data, training_labels)
        val_pred = clf.predict(data["validation_data"])
        accuracies.append(accuracy_score(val_true, val_pred))

    # Index of the highest validation accuracy (first winner on ties).
    best = max(range(len(accuracies)), key=accuracies.__getitem__)
    top_C = C_vals[best]

    # Retrain on the full (shuffled) training set with the best C and
    # generate the Kaggle submission.
    cifar10_data = scipy.io.loadmat('data/cifar10_data.mat')
    test_data = cifar10_data["test_data"]
    training_data, training_labels = shuffle(cifar10_data["training_data"],
                                             cifar10_data["training_labels"])
    clf = svm.LinearSVC(C=top_C)
    clf.fit(training_data, training_labels)
    test_preds = clf.predict(test_data)
    results_to_csv(test_preds)
    return C_vals, accuracies

#c, acc = cifar10_hyperparameter()
#print(c)
#print(acc)
def evaluateModel(X, y, split, model, filename=None, Z=None):
    """Shuffle (X, y) jointly, train `model` on the first `split` rows and
    validate on the rest; optionally predict on test data Z and save to CSV.

    Args:
        X: feature matrix, shape (n_samples, n_features).
        y: label vector, shape (n_samples,).
        split: number of shuffled rows used for training; the rest validate.
        model: any estimator exposing fit/predict.
        filename: if given, test predictions are written to this CSV file.
        Z: test feature matrix; required whenever filename is given.

    Returns:
        (train_accuracy, validation_accuracy).

    Raises:
        ValueError: if filename is given without Z. (The original fell
        through to model.predict(None), failing with an opaque error deep
        inside the estimator.)
    """
    if filename and Z is None:
        raise ValueError("evaluateModel: filename given but no test data Z")

    # Glue labels onto the features so a single shuffle keeps rows aligned.
    full = np.concatenate((X, y.reshape(-1, 1)), axis=1)
    np.random.shuffle(full)
    train = full[:split, :]
    val = full[split:, :]

    train_x, train_y = split_xy(train)
    train_y = train_y.reshape(-1)
    val_x, val_y = split_xy(val)
    val_y = val_y.reshape(-1)

    model.fit(train_x, train_y)
    prediction_train = model.predict(train_x)
    prediction_val = model.predict(val_x)

    if filename:
        prediction_test = model.predict(Z)
        results_to_csv(prediction_test, filename)
        print("predictions saved")

    return (measure_accuracy(prediction_train, train_y),
            measure_accuracy(prediction_val, val_y))
# --- fragment: continues a loop/section from above — records the validation
#     error for the current depth `d`, then plots error vs. depth.
#     TODO confirm the enclosing loop structure (not visible in this chunk).
v_error = classifier.errorRate(predictions, yv)
depths.append(d)
v_errors.append(v_error)

# Plot validation error as a function of tree depth.
plt.plot(depths, v_errors)
plt.show()

if spam_kaggle:
    print("Making Kaggle Predictions for Spam...")
    """ Make predictions of the test data for kaggle submission. Set maxDepth=22 based on the validation test. """
    # NOTE(review): the note above says maxDepth=22 but the code uses 57 —
    # one of the two is stale; confirm which depth validation selected.
    classifier = RandomForest(100, maxDepth=57)
    classifier.fit(X, y)
    predictions = np.array(classifier.predict(Z))
    results_to_csv(predictions)

if spam_decision_tree:
    """ Generate a random 80/20 training/validation split. """
    # Seeding both shuffles with 42 keeps X and y aligned row-for-row.
    np.random.seed(42)
    np.random.shuffle(X)
    np.random.seed(42)
    np.random.shuffle(y)
    i = math.ceil(len(y)*0.8)
    Xt = X[:i]   # training features
    yt = y[:i]   # training labels
    Xv = X[i:]   # validation features
    yv = y[i:]   # validation labels
# --- fragment: tail of a predict/classify helper whose def (and the loop
#     supplying `p`) begin above this view — thresholds each probability
#     into a hard 0/1 label.
        if p < threshold:
            results.append(0)
        else:
            results.append(1)
    return np.array(results)

# --- SGD training for regularized logistic regression ---
learning_rate, lambda_, iter_ = 0.00001, 0.01, 1500
w = np.zeros((x_train.shape[1], 1))
lost = []  # cost history, one entry per iteration
for i in range(iter_):
    # Decaying step size: alpha_i = learning_rate / (i + 1).
    alpha = learning_rate / (i + 1)
    cost = cost_fn(x_train, y_train, w, lambda_)
    lost.append(cost)
    idx = np.random.randint(x_train.shape[0])
    # NOTE(review): x_train[idx:] selects ALL rows from idx to the end, not
    # the single random sample — for true SGD this should almost certainly
    # be x_train[idx:idx+1] (and likewise y_train[idx:idx+1]); confirm.
    y_hat = np.dot(x_train[idx:], w)
    grad = 2 * lambda_ * w - np.dot(x_train[idx:].T, (y_train[idx:] - sigmoid(y_hat)))
    w -= alpha * grad

# Validation accuracy of the trained weights.
pred = predict(w, x_val)
accuracy = sum(pred == y_val)/pred.shape[0]
print("val accuracy", accuracy)

# Standardize the Kaggle test data and save predictions.
test_data_norm = preprocessing.scale(data["X_test"])
predictions = predict(w, test_data_norm)
save_csv.results_to_csv(predictions)
# Report and plot the cross-validation accuracies for each candidate C.
print("accuracies:", bv_score)
plt.plot(C, bv_score, 'yo')
# plt.xscale('log')
plt.xlabel('C values')
plt.ylabel('accuracy_score')
plt.title('spam dataset K-Fold Cross-Validation')
# plt.show()
plt.savefig('figure_5.png')
plt.close()

############################
# Problem 6: Kaggle
############################
import save_csv

# NOTE(review): parts (a)-(c) below all reuse the same `training_data`,
# `training_labels`, and `data` variables — presumably each dataset is
# (re)loaded between these sections elsewhere, and results_to_csv must
# write to distinct files or each call overwrites the last. Verify.

# (a) For the MNIST dataset, use C=10^-6.
model = svm.LinearSVC(C=0.000001)
model.fit(training_data, training_labels.ravel())
save_csv.results_to_csv(model.predict(data["test_data"]))

# (b) For the spam dataset, use C=14
model = svm.LinearSVC(C=14)
model.fit(training_data, training_labels.ravel())
save_csv.results_to_csv(model.predict(data["test_data"]))

# (c) For the cifar10 dataset
model = svm.LinearSVC()
model.fit(training_data, training_labels.ravel())
save_csv.results_to_csv(model.predict(data["test_data"]))
# --- fragment: tail of stochasticGradientDescent, whose def (and the loop
#     supplying `ind`) begin above this view ---
        # Single-sample update with step size delta/it scaled by the
        # iteration budget.
        xi = X[ind]
        yi = y[ind]
        update = (delta / it) * ((yi - s_stoch(xi, w)) * xi - lamb * w)
        w = w + update
    return w

# Measure the cost after SGD training at several iteration counts.
# NOTE(review): w_dsgd is passed in and reassigned each pass, so every run
# warm-starts from the previous run's weights — confirm that is intended.
dsgd_costs = []
for it in dif_iters:
    print(it)
    w_dsgd = stochasticGradientDescent(data, labels, w_dsgd, sgd_regularizer, sgd_step_size, it)
    J_val = J(data, labels, w_dsgd, 0)
    print(J_val)
    dsgd_costs.append(J_val)

# NOTE(review): the loop above iterates dif_iters but the x-axis below uses
# num_iters — confirm they are the same sequence, otherwise the plot is
# misaligned.
plt.plot(num_iters, dsgd_costs, 'ro-')
plt.ylabel("Cost Function Value")
plt.xlabel("Number of Training Iterations")
plt.title(
    "Cost Value vs Training Iterations for Dynamic Stochastic Gradient Descent"
)
plt.show()

# Part 6 ////////////////////////////////////////////////////////////////////////////////////////////////////////
# train the model and run it on the test data
w = batchGradientDescent(data, labels, w_bgd, bgd_regularizer, bgd_step_size, 50)
preds = s(test_data, w)
results_to_csv(preds[:, 0])
# --- fragment: tail of a dataset-dispatch if/elif chain that begins above
#     this view ---
else:
    raise NotImplementedError("Dataset %s not handled" % dataset)

print("Features:", features)
print("Train/test size:", X.shape, Z.shape)

print("\n\nPart 0: constant classifier")
# Baseline: accuracy of always predicting class 0 — assumes labels are
# 0/1 so np.sum(y)/y.size is the fraction of positives. TODO confirm.
print("Accuracy", 1 - np.sum(y) / y.size)

# Basic decision tree
print("\n\nPart (a-b): simplified decision tree")
dt = DecisionTree(max_depth=3, feature_labels=features)
dt.fit(X, y)
print("Predictions", dt.predict(Z))
save_csv.results_to_csv(dt.predict(Z))

print("kfold")
# Disabled ShuffleSplit evaluation of the depth-3 tree (kept verbatim;
# the closing quotes fall outside this chunk).
"""
ss = ShuffleSplit(n_splits=5)
totals = []
for train_index, test_index in ss.split(X):
    dt = DecisionTree(max_depth=3, feature_labels=features)
    dt.fit(X[train_index], y[train_index])
    predictions = dt.predict(X[test_index])
    total = 0
    for i in range(len(predictions)):
        total += predictions[i] == y[test_index][i]
    totals.append(total/len(predictions))
print(np.mean(totals))