def run(X_train, y_train, X_test, y_test, _k=(1,)):
    """
    Run a k-NN experiment: compute the train/test distances once, then
    predict the test labels and report the accuracy for every requested k.

    :param X_train: np mat, dimensions: N x D
    :param y_train: np mat, dimensions: N
    :param X_test: np mat, dimensions: M x D
    :param y_test: np mat, dimensions: M
    :param _k: iterable of int. The neighbour counts to evaluate.

    :return: None. One accuracy line is printed per k.
    """
    # The distances do not depend on k, so they are computed a single time.
    dists = mlBasics.compute_euclidean_distances(X_train, X_test)

    print("Distances computed")

    for k in _k:
        # Predict labels for this neighbour count and score them.
        y_test_pred = mlBasics.predict_labels(dists, y_train, k=k)
        accuracy = np.mean(y_test_pred == y_test) * 100

        # A single pre-formatted string prints identically whether print is a
        # statement (Python 2) or a function (Python 3).
        print('{0:0.02f} of test examples classified correctly. k = {1}'
              .format(accuracy, k))
def task1d(X_train, y_train, X_test, y_test):
    """
    Compare the cost of predicting the test labels with k = 1 against k = 3.

    The distance matrix is computed once, outside the timed region, so only
    the label-prediction step is measured for each k.

    :param X_train: training samples
    :param y_train: training labels
    :param X_test: test samples
    :param y_test: test labels
    :return: None. The timing difference and the k = 3 accuracy are printed.
    """
    # k = 1 neighbor and the k neighbor which performed best in 1c question
    k_set = [1, 3]

    dists = mlBasics.compute_euclidean_distances(X_train, X_test)

    durations = []   # wall-clock seconds spent in predict_labels, per k
    accuracies = []  # test accuracy in percent, per k
    for k in k_set:
        start_time = time.time()
        y_test_pred = mlBasics.predict_labels(dists, y_train, k)
        durations.append(time.time() - start_time)
        accuracies.append(np.mean(y_test_pred == y_test) * 100)

    # Extra time the second k costs relative to k = 1.
    increased_time = durations[1] - durations[0]
    acc_my_classifier = accuracies[1]

    # Single-string prints behave the same on Python 2 and Python 3.
    print("The increased computation time is  %s" % (increased_time,))
    print("The accuracy of my classifier is  %s" % (acc_my_classifier,))
def task1b(X_train, y_train, X_test, y_test):
    """
    Classify the first 10 test samples with k-NN (k = 1 and k = 5) trained on
    a random class-balanced subset, and plot a confusion matrix for each k.

    For every label 0..9, 100 training samples are drawn without replacement.

    :param X_train: training samples, indexable by a boolean mask over y_train
    :param y_train: training labels
    :param X_test: test samples; only the first 10 are used
    :param y_test: test labels; only the first 10 are used
    :return: None. Accuracies are printed and the matrices are plotted.
    """
    exm_num_per_class = 100
    k_set = [1, 5]
    class_labels = list(range(10))

    X_training = []
    y_training = []
    for i in class_labels:
        class_mask = y_train == i
        x_train_i = X_train[class_mask]
        y_train_i = y_train[class_mask]
        random_indexes = np.random.choice(len(y_train_i),
                                          size=exm_num_per_class,
                                          replace=False)
        # Fancy indexing already yields fresh arrays, so no extra copy needed.
        X_training.extend(x_train_i[random_indexes])
        y_training.extend(y_train_i[random_indexes])

    # Arrays rather than lists, matching what the other callers hand to mlBasics.
    X_training = np.array(X_training)
    y_training = np.array(y_training)
    X_testing = X_test[0:10]
    y_testing = y_test[0:10]

    # The distances do not depend on k, so compute them once for all k.
    dists = mlBasics.compute_euclidean_distances(X_training,
                                                 np.array(X_testing))

    for k in k_set:
        y_test_pred = mlBasics.predict_labels(dists, y_training, k=k)
        accuracy = np.mean(y_test_pred == y_testing) * 100
        # Single-string print: identical output on Python 2 and Python 3.
        print('For k =  {0}  : {1:0.02f} of test examples classified correctly.'
              .format(k, accuracy))
        cm_1 = confusion_matrix(y_testing, y_test_pred, labels=class_labels)
        title = "Confusion Matrix k = " + str(k)
        plot_confusion_matrix(cm_1, class_labels, title)
def five_fold_CV(dataset, labels, k):
    """
    Estimate k-NN accuracy with 5-fold cross validation.

    The data is cut into 5 contiguous, equally sized segments; samples left
    over when the sample count is not a multiple of 5 are not used. Each
    segment is held out once while the other four form the training set.

    :param dataset: 2-D array of samples, one row per sample
    :param labels: labels aligned with the rows of dataset
    :param k: number of neighbours passed to mlBasics.predict_labels
    :return: mean accuracy over the 5 folds, in percent
    """
    fold = 5
    # Floor division keeps the slice bounds integral on Python 3 as well.
    seg_size = dataset.shape[0] // fold
    bounds = [(i * seg_size, (i + 1) * seg_size) for i in range(fold)]
    segments = [dataset[lo:hi] for lo, hi in bounds]
    segment_labels = [labels[lo:hi] for lo, hi in bounds]

    acc = []
    for i in range(fold):
        # Train on every segment except the held-out one, with labels taken
        # from the same segments so rows and labels stay aligned.
        train_X = np.concatenate(
            [segments[j] for j in range(fold) if j != i])
        train_y = np.concatenate(
            [segment_labels[j] for j in range(fold) if j != i])

        dists = mlBasics.compute_euclidean_distances(train_X, segments[i])
        test_pred = mlBasics.predict_labels(dists, train_y, k)

        if k == 1:
            acc.append(np.mean(test_pred == segment_labels[i]) * 100)
        else:
            # NOTE(review): this branch assumes predict_labels returns, for
            # k > 1, one row of neighbour labels per test sample, which is
            # then reduced by majority vote - confirm against mlBasics.
            classifications = np.array([
                np.argmax(np.bincount(y.astype(np.int64))).astype(np.float64)
                for y in test_pred
            ])
            acc.append(np.mean(classifications == segment_labels[i]) * 100)

    return np.mean(acc)
def pred_accuracy(train_X, train_y, test_X, test_y, k):
    """
    Predict labels for test_X and score them against test_y.

    :return: tuple (predictions, percentage of predictions equal to test_y)
    """
    predictions = mlBasics.predict_labels2(
        mlBasics.compute_euclidean_distances(train_X, test_X), train_y, k)
    accuracy_pct = np.mean(predictions == test_y) * 100
    return predictions, accuracy_pct
# Example #6
0
def cross_validation(x_train, y_train, knn=1, K=5):
    """
    Run shuffled K-fold cross validation of a k-NN classifier.

    :param x_train: samples, indexable by the index arrays KFold.split yields
    :param y_train: labels aligned with x_train
    :param knn: number of neighbours passed to mlBasics.predict_labels
    :param K: number of folds
    :return: list with one accuracy (in percent) per fold
    """
    kf = KFold(n_splits=K, shuffle=True)
    accuracy_all_fold = []
    for fold_idx, (train, test) in enumerate(kf.split(x_train)):
        dists = mlBasics.compute_euclidean_distances(x_train[train],
                                                     x_train[test])
        y_test_pred = mlBasics.predict_labels(dists, y_train[train], k=knn)
        accuracy = np.mean(y_test_pred == y_train[test]) * 100
        accuracy_all_fold.append(accuracy)
        # "K=" reports the neighbour count and "Fold" the fold index.
        print('K= {0} Fold {1} Accuracy {2:.2f}'.format(
            knn, fold_idx, accuracy))
    return accuracy_all_fold
# Example #7
0
def cross_validation(X, Y, num_folds=5, k=1):
    """
    Cross-validate a k-NN classifier over contiguous folds of the data.

    :param X: samples, split along the first axis
    :param Y: labels aligned with X
    :param num_folds: number of folds to split the data into
    :param k: number of neighbours passed to mlBasics.predict_labels
    :return: mean accuracy over the folds, as a fraction (not a percentage)
    """
    # Keep the folds as plain lists: np.array_split may return folds of
    # unequal length, which cannot be packed into one rectangular array.
    X_folds = np.array_split(X, num_folds)
    y_folds = np.array_split(Y, num_folds)

    accuracies = []
    for i in range(num_folds):
        # Everything except fold i forms the training set.
        X_train_data = np.concatenate(
            [fold for j, fold in enumerate(X_folds) if j != i])
        Y_train_data = np.concatenate(
            [fold for j, fold in enumerate(y_folds) if j != i])
        dists = mlBasics.compute_euclidean_distances(X_train_data, X_folds[i])
        y_test_pred = mlBasics.predict_labels(dists, Y_train_data, k)
        accuracies.append(np.mean(y_test_pred == y_folds[i]))

    mean_accuracy = np.mean(accuracies)
    # Single-string print: identical output on Python 2 and Python 3.
    print('for k=%d, mean acc=%f ' % (k, mean_accuracy))

    return mean_accuracy
# Example #8
0
        label_sample_idx[label] = [idx]

# Build a training subset by drawing 100 sample indices for every label in
# label_sample_idx, then classify X_test with k = 1 and k = 5.

# Index buffer filled in consecutive 100-wide slices [i:j], one per label.
# NOTE(review): the fixed size 1000 assumes label_sample_idx holds exactly
# 10 labels - confirm against the code that builds it.
sample_idx = np.empty(1000, dtype='int')
i, j = 0, 100
for label in label_sample_idx:
    # NOTE(review): np.random.choice samples with replacement by default, so
    # the 100 picks for a label may contain duplicates - confirm intended.
    sample_idx[i:j] = np.random.choice(label_sample_idx[label], size=100)
    i, j = j, j + 100

x_train_sample, y_train_sample = X_train[sample_idx], y_train[sample_idx]

# Reshape images: keep the first axis and flatten the rest into one row.
x_train_sample = np.reshape(x_train_sample, (x_train_sample.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))

print('Compute Distances')
# Computed once and reused for both values of k below.
dists = mlBasics.compute_euclidean_distances(x_train_sample, X_test)

print('For k=1 Neighbour')
y_test_pred = mlBasics.predict_labels(dists, y_train_sample, k=1)

print('For k=5 Neighbours')
y_test_pred_5 = mlBasics.predict_labels(dists, y_train_sample, k=5)

from sklearn.metrics import confusion_matrix

# One confusion matrix per k; first argument is the true labels, second the
# predictions.
conf_1 = confusion_matrix(y_test, y_test_pred)
conf_5 = confusion_matrix(y_test, y_test_pred_5)

# Percentage of k=1 predictions that equal the true test labels.
print('{0:0.02f}'.format(np.mean(y_test_pred == y_test) * 100),
      'of test examples classified correctly for k=1 Neighbour(s).')
print('Confusion Matrix for k=1 Neighbour(s)')
@author: fame
"""

from load_mnist import load_mnist
import hw1_knn as mlBasics
import numpy as np

# Two-class k-NN sanity check: load the MNIST digits 0 and 1, classify the
# test split with mlBasics and report the accuracy.

# Load data - two class
X_train, y_train = load_mnist('training', [0, 1])
X_test, y_test = load_mnist('testing', [0, 1])

# Load data - ALL CLASSES
#X_train, y_train = load_mnist('training'  )
#X_test, y_test = load_mnist('testing'   )

# Reshape the image data into rows (first axis kept, the rest flattened)
X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))

# Test on test data
#1) Compute distances:
dists = mlBasics.compute_euclidean_distances(X_train, X_test)

#2) Run the code below and predict labels:
# No k is passed here, so predict_labels runs with its own default.
y_test_pred = mlBasics.predict_labels(dists, y_train)

#3) Report results
# you should get following message '99.91 of test examples classified correctly.'
print('{0:0.02f}'.format(np.mean(y_test_pred == y_test) * 100),
      "of test examples classified correctly.")
def task1c(X_train, y_train):
    '''
    Run 5-fold cross validation for k = 1..15 on a class-balanced subset and
    plot the mean accuracy per k.

    For each label 0..9, 100 training samples are drawn without replacement
    and cut into 5 consecutive slices. Fold n tests on the n-th slice of
    every class and trains on the remaining four slices.

    :param X_train: training samples, indexable by a boolean mask over y_train
    :param y_train: training labels
    :return: None. The per-k mean accuracies are passed to plot_acc_for_k.
    '''
    # Candidate neighbour counts, kept as a list so it can be reused for
    # ordering the results and as the plot axis.
    k_set = list(range(1, 16))
    fold_num = 5
    # Samples drawn per class.
    exm_num_per_class = 100
    # Samples per class in one fold. Floor division keeps the slice bounds
    # integral on Python 3 as well.
    stepsize = exm_num_per_class // fold_num

    # Per-fold lists of per-class chunks, for training and testing.
    folds_training = [[] for _ in range(fold_num)]
    label_for_folds_training = [[] for _ in range(fold_num)]
    folds_testing = [[] for _ in range(fold_num)]
    label_for_folds_testing = [[] for _ in range(fold_num)]

    # For each class (labels 0-9), sample 100 images randomly.
    for i in range(10):
        class_mask = y_train == i
        x_train_i = X_train[class_mask]
        y_train_i = y_train[class_mask]
        random_indexes = np.random.choice(len(y_train_i),
                                          size=exm_num_per_class,
                                          replace=False)
        x_train_i = x_train_i[random_indexes]
        y_train_i = y_train_i[random_indexes]
        for n in range(fold_num):
            lo, hi = stepsize * n, stepsize * (n + 1)
            held_out = list(range(lo, hi))
            # Slice [lo:hi] is the test part; everything else is training.
            folds_testing[n].append(x_train_i[lo:hi])
            label_for_folds_testing[n].append(y_train_i[lo:hi])
            folds_training[n].append(np.delete(x_train_i, held_out, axis=0))
            label_for_folds_training[n].append(
                np.delete(y_train_i, held_out, axis=0))

    # result_records = {k: [acc_fold1, ..., acc_fold5]} for every k in k_set.
    result_records = {k: [] for k in k_set}
    for n in range(fold_num):
        # Merge the per-class chunks into one 2-D sample matrix and one flat
        # label vector per fold.
        folds_train = np.reshape(folds_training[n],
                                 (-1, np.shape(folds_training[n])[-1]))
        folds_test = np.reshape(folds_testing[n],
                                (-1, np.shape(folds_testing[n])[-1]))
        label_for_folds_train = np.reshape(label_for_folds_training[n], -1)
        label_for_folds_test = np.reshape(label_for_folds_testing[n], -1)

        # The distances are shared by all k within a fold.
        dists = mlBasics.compute_euclidean_distances(folds_train, folds_test)
        for k in k_set:
            y_test_pred = mlBasics.predict_labels(dists,
                                                  label_for_folds_train,
                                                  k=k)
            acc = np.mean(y_test_pred == label_for_folds_test) * 100
            result_records[k].append(acc)

    # Iterate k_set so accuracies and the k axis are in the same, known order.
    mean_acc_record = [np.mean(result_records[k]) for k in k_set]
    plot_acc_for_k(mean_acc_record, k_set)
# Example #11
0
def test_all_data(X_train, y_train, X_test, y_test, k):
    """
    Predict labels for X_test with k neighbours and return the percentage of
    predictions that equal y_test.
    """
    distances = mlBasics.compute_euclidean_distances(X_train, X_test)
    predictions = mlBasics.predict_labels(distances, y_train, k)
    hit_rate = np.mean(predictions == y_test)
    return hit_rate * 100
# Example #12
0
if __name__ == '__main__':
    '''
    (a) Load data - ALL class 
    '''
    X_train, y_train, X_test, y_test = load_all_data()
    '''
    (b) Load 1000 training example, 100 from each class and visualize 1 and 5 nearest neighbour 
    for first 10 test examples
    '''
    sample_size = 100  #samples per class

    X_1000, Y_1000 = extract_samples_per_class(X_train, y_train, sample_size)

    # k=1
    dists = mlBasics.compute_euclidean_distances(X_1000, X_test)
    y_test_pred_1 = mlBasics.predict_labels(dists, Y_1000, k=1)
    print '################## part b #########################'
    print 'for k=1, {0:0.02f}'.format(
        np.mean(y_test_pred_1 == y_test) *
        100), "of test examples classified correctly."

    # k=5
    y_test_pred_5 = mlBasics.predict_labels(dists, Y_1000, k=5)
    print 'for k=5, {0:0.02f}'.format(
        np.mean(y_test_pred_5 == y_test) *
        100), "of test examples classified correctly."

    #Confusion Matrix
    C_1 = metrics.confusion_matrix(y_test, y_test_pred_1)
    C_5 = metrics.confusion_matrix(y_test, y_test_pred_5)