Example #1
def kaggle(alpha, c, epsilon):
    X = temp["X"]
    test = temp["X_test"]
    x_labels = temp["y"]
    X, x_labels = permute_dictionaries(X, x_labels)  # shuffle samples and labels together
    X = scaler.transform(X)  # standardize with the fitted training scaler
    test = scaler.transform(test)
    X = np.hstack((X, np.ones((X.shape[0], 1))))  # append a bias column
    test = np.hstack((test, np.ones((test.shape[0], 1))))
    w = np.zeros((X.shape[1], 1))  # one weight per feature, including the bias

    epoch = 0
    loss = np.inf
    # Full-batch gradient descent until the loss drops below epsilon.
    while loss >= epsilon:
        x_predictions = sigmoid(np.matmul(X, w))

        dw = calc_gradient(X, x_labels, x_predictions, w, c)
        w = w - alpha * dw
        loss = obj(x_labels, x_predictions, w, c)
        epoch += 1
        if epoch % 1000 == 0:
            print(loss)
    test_predictions = classify(test, w)
    print(test_predictions)
    results_to_csv(test_predictions.flatten())
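The helpers sigmoid, calc_gradient, obj, and classify are defined elsewhere in the author's file. A minimal sketch of plausible implementations, assuming an L2-regularized cross-entropy objective (the exact scaling of the regularizer c and the 0.5 decision threshold are guesses):

import numpy as np

def sigmoid(z):
    # Logistic function mapping scores to (0, 1).
    return 1.0 / (1.0 + np.exp(-z))

def calc_gradient(X, y, predictions, w, c):
    # Gradient of the regularized negative log-likelihood:
    # X^T (s(Xw) - y) + 2*c*w.
    return np.matmul(X.T, predictions - y) + 2 * c * w

def obj(y, predictions, w, c):
    # Cross-entropy loss plus an L2 penalty; eps guards against log(0).
    eps = 1e-12
    ce = -np.sum(y * np.log(predictions + eps) +
                 (1 - y) * np.log(1 - predictions + eps))
    return ce + c * np.sum(w ** 2)

def classify(X, w, threshold=0.5):
    # Hard 0/1 labels from the fitted weights.
    return (sigmoid(np.matmul(X, w)) >= threshold).astype(int)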
def mnist_hyperparameter():
    """returns the best value of C"""
    data = split_mnist_data()
    training_data = data["training_data"][:10000]
    training_labels = data["training_labels"][:10000]
    val_true = data["validation_labels"]
    C_vals = []
    accuracies = []
    for exp in range(-8, -1):
        C = 10**(exp)
        C_vals.append(C)
        clf = svm.SVC(kernel='linear', C=C)
        clf.fit(training_data, training_labels)
        val_pred = clf.predict(data["validation_data"])
        accuracy = accuracy_score(val_true, val_pred)
        accuracies.append(accuracy)
    # Pick the C with the highest validation accuracy.
    top_C = C_vals[int(np.argmax(accuracies))]
    #print("Top C:", top_C)
    mnist_data = scipy.io.loadmat('data/mnist_data.mat')
    test_data = mnist_data["test_data"]
    training_data, training_labels = shuffle(mnist_data["training_data"],
                                             mnist_data["training_labels"])
    # Refit on the full training set with the best C before predicting.
    clf = svm.LinearSVC(C=top_C)
    clf.fit(training_data, training_labels.ravel())
    test_preds = clf.predict(test_data)
    results_to_csv(test_preds)
    return C_vals, accuracies
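Every example on this page ends by calling results_to_csv, a course-provided helper. A minimal sketch, assuming the usual two-column Kaggle submission layout (the column names and default filename are assumptions):

import pandas as pd

def results_to_csv(y_test, filename='submission.csv'):
    # One row per test sample: a 1-indexed Id plus the predicted label.
    df = pd.DataFrame({'Category': y_test.astype(int)})
    df.index += 1  # Kaggle submission Ids start at 1
    df.to_csv(filename, index_label='Id')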
Example #3
def problem5(training_data, training_data_labels, test_data, C_value):
    classifier = svm.LinearSVC(dual=False, random_state=10, C=C_value)

    classifier.fit(training_data, np.ravel(training_data_labels))

    predict_training_results = classifier.predict(training_data)
    print(accuracy_score(np.ravel(training_data_labels), predict_training_results))
    predict_test_results = classifier.predict(test_data)
    results_to_csv(predict_test_results)
Example #4
def problem6(training_data, training_data_labels, test_data, C_Value=0):

    classifier = svm.LinearSVC(random_state=40, C=10**C_Value)

    classifier.fit(training_data, np.ravel(training_data_labels))

    predict_training_results = classifier.predict(training_data)
    print(
        accuracy_score(np.ravel(training_data_labels),
                       np.ravel(predict_training_results)))
    predict_test_results = classifier.predict(test_data)
    results_to_csv(predict_test_results)
Example #5
def problem6(training_data, training_data_labels, test_data, linear, C_Value=0):
    # Note: C_Value is currently unused; both classifiers fall back to C=1.0.
    if linear:
        classifier = svm.LinearSVC(dual=False, random_state=10, verbose=1,
                                   max_iter=1000000)
    else:
        classifier = svm.SVC(kernel="linear", random_state=10, verbose=0)

    classifier.fit(training_data, np.ravel(training_data_labels))

    predict_training_results = classifier.predict(training_data)
    print(accuracy_score(np.ravel(training_data_labels), predict_training_results))
    predict_test_results = classifier.predict(test_data)
    results_to_csv(predict_test_results)
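Examples #3 through #5 alternate between svm.LinearSVC and svm.SVC(kernel='linear'); the two are not interchangeable. A short sketch of the trade-off:

from sklearn import svm

# liblinear backend: one-vs-rest multiclass, squared hinge loss by
# default, no kernel trick; scales well to large sample counts.
fast_clf = svm.LinearSVC(C=1.0, dual=False)

# libsvm backend: one-vs-one multiclass, standard hinge loss, exposes
# support vectors; training cost grows quickly with dataset size.
exact_clf = svm.SVC(kernel='linear', C=1.0)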
Example #6
def kaggle(c):
    # Deskew each image before fitting and before predicting.
    data = deskew_all(mnist_data["training_data"])
    labels = mnist_data["training_labels"]
    test_data = deskew_all(mnist_data["test_data"])
    partitioned_data = partition_data(data, labels)

    # Per-class Gaussian parameters: means, covariances, and priors.
    means = empirical_mean(partitioned_data)
    partitioned_covariances = empirical_cov(partitioned_data)
    priors = calc_priors(partitioned_data, len(data))
    samples = {'training': data, 'test': test_data}

    predictions = QDA(means, partitioned_covariances, priors, samples, c)
    train_predictions = predictions['training']
    test_predictions = predictions['test']
    print(error_rate(np.array([train_predictions]).T, labels))
    results_to_csv(np.array(test_predictions))
    return
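partition_data, empirical_mean, empirical_cov, calc_priors, and QDA are the author's own helpers from an earlier assignment. A minimal sketch of the QDA step, assuming the parameter containers are dicts keyed by class label and that c is added to each covariance's diagonal as regularization (both assumptions):

import numpy as np
from scipy.stats import multivariate_normal

def QDA(means, covs, priors, samples, c):
    # For each sample set, score every class-conditional Gaussian
    # N(mu_k, Sigma_k + c*I) and pick the class maximizing
    # log prior + log likelihood.
    classes = sorted(means.keys())
    out = {}
    for name, X in samples.items():
        d = X.shape[1]
        scores = np.column_stack([
            np.log(priors[k]) +
            multivariate_normal.logpdf(X, means[k], covs[k] + c * np.eye(d))
            for k in classes
        ])
        out[name] = np.array(classes)[np.argmax(scores, axis=1)]
    return out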
def spam_hyperparameter():
    C_vals = []
    accuracies = []
    data = split_spam_data_crossval(5)
    for exp in range(-5, 4):
        print(exp)
        C = 10**exp
        C_vals.append(C)
        accuracy = 0
        for k in range(5):
            training_data = []
            training_labels = []
            for i in range(5):
                if not i == k:
                    training_data.append(data[i]["data"])
                    training_labels.append(data[i]["labels"])
            val_data = data[k]["data"]
            val_true = data[k]["labels"]
            clf = svm.LinearSVC(C=C)
            # Fit once on the concatenated k-1 training folds; calling fit
            # per fold would discard earlier folds, since LinearSVC.fit
            # is not incremental.
            clf.fit(np.concatenate(training_data),
                    np.concatenate(training_labels).ravel())
            val_pred = clf.predict(val_data)
            accuracy += accuracy_score(val_true, val_pred)
        accuracies.append(accuracy / 5)
    # Pick the C with the highest mean cross-validation accuracy.
    top_C = C_vals[int(np.argmax(accuracies))]
    spam_data = scipy.io.loadmat('data/spam_data.mat')
    test_data = spam_data["test_data"]
    training_data, training_labels = shuffle(spam_data["training_data"],
                                             spam_data["training_labels"])
    # Refit on the full training set with the best C before predicting.
    clf = svm.LinearSVC(C=top_C)
    clf.fit(training_data, training_labels.ravel())
    test_preds = clf.predict(test_data)
    results_to_csv(test_preds)
    return C_vals, accuracies
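As a sanity check, the hand-rolled 5-fold loop above can be cross-checked against scikit-learn's built-in cross-validation. A short equivalent sketch, where X and y stand for the full spam training set (names assumed):

import numpy as np
from sklearn import svm
from sklearn.model_selection import cross_val_score

def best_C(X, y, exponents=range(-5, 4), folds=5):
    # Mean k-fold validation accuracy for each candidate C.
    C_vals = [10.0 ** e for e in exponents]
    scores = [cross_val_score(svm.LinearSVC(C=C), X, y, cv=folds).mean()
              for C in C_vals]
    return C_vals[int(np.argmax(scores))]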
def cifar10_hyperparameter():
    """returns the best value of C"""
    data = split_cifar10_data()
    training_data = data["training_data"][:2000]
    training_labels = data["training_labels"][:2000]
    val_true = data["validation_labels"]
    C_vals = []
    accuracies = []
    for exp in range(-2, 2):
        print(exp)
        C = 10**(exp)
        C_vals.append(C)
        clf = svm.LinearSVC(C=C)
        clf.fit(training_data, training_labels)
        val_pred = clf.predict(data["validation_data"])
        accuracy = accuracy_score(val_true, val_pred)
        accuracies.append(accuracy)
    # Pick the C with the highest validation accuracy.
    top_C = C_vals[int(np.argmax(accuracies))]
    cifar10_data = scipy.io.loadmat('data/cifar10_data.mat')
    test_data = cifar10_data["test_data"]
    training_data, training_labels = shuffle(cifar10_data["training_data"],
                                             cifar10_data["training_labels"])
    # Refit on the full training set with the best C before predicting.
    clf = svm.LinearSVC(C=top_C)
    clf.fit(training_data, training_labels.ravel())
    test_preds = clf.predict(test_data)
    results_to_csv(test_preds)
    return C_vals, accuracies


#c, acc = cifar10_hyperparameter()
#print(c)
#print(acc)
Example #9
def evaluateModel(X, y, split, model, filename=None, Z=None):
    # Stack labels onto the features so one shuffle keeps rows aligned,
    # then split into training and validation sets.
    full = np.concatenate((X, y.reshape(-1, 1)), axis=1)
    np.random.shuffle(full)
    train = full[:split, :]
    val = full[split:, :]

    train_x, train_y = split_xy(train)
    train_y = train_y.reshape(-1, )

    val_x, val_y = split_xy(val)
    val_y = val_y.reshape(-1)

    model.fit(train_x, train_y)

    prediction_train = model.predict(train_x)
    prediction_val = model.predict(val_x)

    if filename and Z is not None:
        prediction_test = model.predict(Z)
        results_to_csv(prediction_test, filename)
        print("predictions saved")

    train_acc = measure_accuracy(prediction_train, train_y)
    val_acc = measure_accuracy(prediction_val, val_y)
    return train_acc, val_acc
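split_xy and measure_accuracy are small project helpers. A plausible minimal sketch (the metric choice is an assumption, but the label column follows from the concatenation above):

import numpy as np

def split_xy(arr):
    # The label was appended as the last column above; peel it back off.
    return arr[:, :-1], arr[:, -1]

def measure_accuracy(pred, truth):
    # Fraction of predictions that match the ground truth.
    return np.mean(pred == truth)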
Example #10
            v_error = classifier.errorRate(predictions, yv)
            depths.append(d)
            v_errors.append(v_error)
        plt.plot(depths, v_errors)
        plt.show()

    if spam_kaggle:
        print("Making Kaggle Predictions for Spam...")
        """
        Make predictions on the test data for Kaggle submission.
        The tree depth was chosen from the validation sweep above.
        """
        classifier = RandomForest(100, maxDepth=57)
        classifier.fit(X, y)
        predictions = np.array(classifier.predict(Z))
        results_to_csv(predictions)

    if spam_decision_tree:
        """
        Generate a random 80/20 training/validation split.
        """
        # Shuffle X and y with one shared permutation so rows stay aligned.
        perm = np.random.RandomState(42).permutation(len(y))
        X, y = X[perm], y[perm]

        i = math.ceil(len(y) * 0.8)
        Xt, yt = X[:i], y[:i]
        Xv, yv = X[i:], y[i:]
        if p < threshold:
            results.append(0)
        else:
            results.append(1)
    return np.array(results)


learning_rate, lambda_, iter_ = 0.00001, 0.01, 1500
w = np.zeros((x_train.shape[1], 1))
losses = []

for i in range(iter_):
    alpha = learning_rate / (i + 1)  # decaying step size
    cost = cost_fn(x_train, y_train, w, lambda_)
    losses.append(cost)

    # Stochastic step: sample a single training point, kept 2-D by slicing.
    idx = np.random.randint(x_train.shape[0])
    xi = x_train[idx:idx + 1]
    yi = y_train[idx:idx + 1]
    y_hat = np.dot(xi, w)
    grad = 2 * lambda_ * w - np.dot(xi.T, yi - sigmoid(y_hat))
    w -= alpha * grad

pred = predict(w, x_val)
accuracy = np.mean(pred == y_val)
print("val accuracy", accuracy)

test_data_norm = preprocessing.scale(data["X_test"])
predictions = predict(w, test_data_norm)
save_csv.results_to_csv(predictions)
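The snippet above assumes predict and cost_fn helpers defined earlier in the file. A minimal sketch of predict, reusing the same sigmoid as the training loop and thresholding at 0.5 (the threshold is an assumption):

import numpy as np

def sigmoid(z):
    # Logistic function, as used in the training loop above.
    return 1.0 / (1.0 + np.exp(-z))

def predict(w, X, threshold=0.5):
    # Probability of class 1 for each row, then a hard 0/1 label.
    probs = sigmoid(np.dot(X, w)).ravel()
    return (probs >= threshold).astype(int)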


print("accuracies:", bv_score)
plt.plot(C, bv_score, 'yo')
# plt.xscale('log')
plt.xlabel('C values')
plt.ylabel('accuracy_score')
plt.title('spam dataset K-Fold Cross-Validation')
# plt.show()
plt.savefig('figure_5.png')
plt.close()

############################
# Problem 6: Kaggle
############################

import save_csv

# (a) For the MNIST dataset, use C=10^-6.
model = svm.LinearSVC(C=0.000001)
model.fit(training_data, training_labels.ravel())
save_csv.results_to_csv(model.predict(data["test_data"]))

# (b) For the spam dataset, use C=14
model = svm.LinearSVC(C=14)
model.fit(training_data, training_labels.ravel())
save_csv.results_to_csv(model.predict(data["test_data"]))

# (c) For the cifar10 dataset
model = svm.LinearSVC()
model.fit(training_data, training_labels.ravel())
save_csv.results_to_csv(model.predict(data["test_data"]))
        xi = X[ind]
        yi = y[ind]
        update = (delta / it) * ((yi - s_stoch(xi, w)) * xi - lamb * w)
        w = w + update
    return w


dsgd_costs = []
for it in dif_iters:
    print(it)
    w_dsgd = stochasticGradientDescent(data, labels, w_dsgd, sgd_regularizer,
                                       sgd_step_size, it)
    J_val = J(data, labels, w_dsgd, 0)
    print(J_val)
    dsgd_costs.append(J_val)
plt.plot(dif_iters, dsgd_costs, 'ro-')
plt.ylabel("Cost Function Value")
plt.xlabel("Number of Training Iterations")
plt.title(
    "Cost Value vs Training Iterations for Dynamic Stochastic Gradient Descent"
)
plt.show()

# Part 6: train the model and run it on the test data

w = batchGradientDescent(data, labels, w_bgd, bgd_regularizer, bgd_step_size,
                         50)
preds = s(test_data, w)
results_to_csv(preds[:, 0])
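The helpers s, s_stoch, and J come from earlier in this file. A plausible minimal sketch, assuming s returns per-sample sigmoid scores, s_stoch is its single-sample form, and J is the regularized cross-entropy cost (shapes and the regularization term are guesses):

import numpy as np

def s(X, w):
    # Sigmoid score for every row of X; returns shape (n, 1).
    return 1.0 / (1.0 + np.exp(-np.dot(X, w)))

def s_stoch(xi, w):
    # Sigmoid score for a single sample xi.
    return 1.0 / (1.0 + np.exp(-np.dot(xi, w)))

def J(X, y, w, lamb):
    # L2-regularized cross-entropy cost over the whole dataset.
    eps = 1e-12
    p = s(X, w)
    return float(-np.sum(y * np.log(p + eps) +
                         (1 - y) * np.log(1 - p + eps)) +
                 lamb * np.sum(w ** 2))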
Example #14
    else:
        raise NotImplementedError("Dataset %s not handled" % dataset)

    print("Features:", features)
    print("Train/test size:", X.shape, Z.shape)

    print("\n\nPart 0: constant classifier")
    # Baseline: always predict class 0; accuracy is the fraction of 0-labels.
    print("Accuracy", 1 - np.sum(y) / y.size)

    # Basic decision tree
    print("\n\nPart (a-b): simplified decision tree")
    dt = DecisionTree(max_depth=3, feature_labels=features)
    dt.fit(X, y)
    predictions = dt.predict(Z)
    print("Predictions", predictions)
    save_csv.results_to_csv(predictions)

    print("kfold")
    """
    ss = ShuffleSplit(n_splits=5)
    
    totals = []
    for train_index, test_index in ss.split(X):
        dt = DecisionTree(max_depth=3, feature_labels=features)
        dt.fit(X[train_index], y[train_index])
        predictions = dt.predict(X[test_index])
        total = 0
        for i in range(len(predictions)):
            total += predictions[i] == y[test_index][i]
        totals.append(total/len(predictions))
    print(np.mean(totals))