def load_data(self):
        """Read both dataset splits and prepare the arrays used for training.

        Sets the label arrays, split sizes, feature dimensionality, the
        mean-centred image matrices (one image per column), a small random
        weight matrix and the one-hot encoded labels on ``self``.
        """
        # Training split: labels are kept as-is, the images drive the sizes.
        self.train_labels, train_images = dataset.read()
        self.train_data_length = len(self.train_labels)
        self.number_of_dimensions = train_images[0].flatten().shape[0]

        # Test split.
        self.test_labels, test_images = dataset.read(dataset='testing')
        self.test_data_length = len(self.test_labels)

        # The images are deliberately NOT divided by 255 here; the original
        # author noted much better accuracy without that normalization.

        # Flatten every image to one row, then transpose so that each image
        # becomes a column.
        train_matrix = train_images.reshape(train_images.shape[0], -1).T
        test_matrix = test_images.reshape(test_images.shape[0], -1).T

        # Centre both splits with the single scalar mean of the training data.
        train_mean = np.nanmean(train_matrix)
        self.train_images_flattened = train_matrix - train_mean
        self.test_images_flattened = test_matrix - train_mean

        # Small random starting weights, one column per class.
        weight_shape = (self.number_of_dimensions, self.number_of_classes)
        self.weight_vector = np.random.random(weight_shape) * 0.000008

        self.one_hot_encoded_labels = self.create_one_hot_representation(
            self.train_labels)
        self.test_one_hot_encoded_labels = self.create_one_hot_representation(
            self.test_labels)
 def load_train_test(self):
     """Load both splits, flatten and scale the images, one-hot the train labels.

     The test images and labels are also stored on ``self`` as ``X_test`` and
     ``Y_test``; the test labels are returned as read (not one-hot encoded).

     Returns:
         Tuple ``(X_train, Y_train, X_test, Y_test)``.
     """
     pixels_per_image = 28 * 28

     train_labels, train_pixels = ld.read()
     self.Y_test, self.X_test = ld.read('testing')

     # One row per image, divided by 255.
     train_pixels = np.reshape(
         train_pixels, (self.training_size, pixels_per_image)) / 255
     self.X_test = np.reshape(
         self.X_test, (self.testing_size, pixels_per_image)) / 255

     train_targets = self.one_hot_representation(train_labels)
     return (train_pixels, train_targets, self.X_test, self.Y_test)
def driver():
    """Read the default split, scale it and start training from zero weights."""
    labels, images = read()

    # One row of 784 values per image, divided by 255.
    features = images.reshape((images.shape[0], 784)) / float(255)

    initial_weights = npy.zeros((784, 10))
    update_weights(features, labels, initial_weights)
Exemple #4
0
    def load_data(self):
        """Read both splits, scale and centre the images, cache squared norms.

        Sets the label arrays, split sizes, the flattened mean-centred image
        matrices (one image per row) and ``train_images_squared`` (the sum of
        squared entries of every training row) on ``self``.
        """
        self.train_labels, train_images = dataset.read()
        self.train_data_length = len(self.train_labels)
        self.test_labels, test_images = dataset.read(dataset='testing')
        self.test_data_length = len(self.test_labels)

        # Normalize the data set: divide by 255, then flatten each image
        # into a single row.
        train_rows = (train_images / 255).reshape(train_images.shape[0], -1)
        test_rows = (test_images / 255).reshape(test_images.shape[0], -1)

        # Centre both splits with the single scalar mean of the training data.
        train_mean = np.nanmean(train_rows)
        self.train_images_flattened = train_rows - train_mean
        self.test_images_flattened = test_rows - train_mean

        # Row-wise dot product of the training matrix with itself.
        centred = self.train_images_flattened
        self.train_images_squared = np.einsum('ij,ij->i', centred, centred)
Exemple #5
0
def main():
    """Run the kNN experiment: compute, pickle, score and plot.

    For every test image the result of ``compute_distance_matrix`` (called
    with 100 as its first argument) is collected; the collected list is
    pickled to ``sorted_list_latest``; finally the accuracies reported by
    ``test_accuracy`` are plotted against the list of K values.
    """
    mnist_dir = "/Users/samarth/Desktop/Fall 2018/CSE 575/Assignment 2/MNIST"

    train_labels, train_images = load_dataset.read("training", mnist_dir)
    train_images = np.reshape(train_images, (60000, 784)) / 255.0

    print(len(train_images))

    test_labels, test_images = load_dataset.read("testing", mnist_dir)
    test_images = np.reshape(test_images, (10000, 784)) / 255.0

    k_list = [1, 3, 5, 10, 30, 50, 70, 80, 90, 100]

    final = []
    for index, test_image in enumerate(test_images):
        final.append(
            compute_distance_matrix(100, train_images, train_labels,
                                    test_image))
        progress = ('Calculating the matrix for k = 100 for test image[' +
                    str(index) + ']')
        print(progress)

    with open('sorted_list_latest', 'wb') as fp:
        pickle.dump(final, fp)

    # Only the per-K list is plotted; the first returned value is unused here.
    accuracy, accuracy_list = test_accuracy(k_list, test_images, test_labels)

    plt.plot(k_list, accuracy_list, color='g')
    plt.xlabel("K")
    plt.ylabel("Accuracy")
    plt.title("KNN Graph")
    plt.show()
def driver():
    """Evaluate kNN for a fixed list of k values, then plot and save results.

    Both splits are read, flattened to 784 columns and divided by 255. For
    every test sample, the first ``k`` entries of ``labels[i][1]`` are passed
    to ``predict``; the fraction of predictions equal to the true label is
    stored per ``k`` and handed to ``plot_graph`` and ``dum_accuracies_file``.
    """
    k_values = [1, 3, 5, 10, 30, 50, 70, 80, 90, 100]

    Y_train, X_train = read()
    X_train = X_train.reshape((60000, 784)) / float(255)

    Y_test, X_test = read("testing")
    X_test = X_test.reshape((10000, 784)) / float(255)

    # One prediction list per k, in the order of k_values.
    predictions_by_k = {k: [] for k in k_values}

    labels = compute_euclidean_distance(X_train, Y_train, X_test)

    test_count = len(X_test)
    for sample in range(test_count):
        candidate_labels = labels[sample][1]
        for k in k_values:
            predictions_by_k[k].append(predict(candidate_labels[0:k]))

    accuracies = {}
    for k in k_values:
        predicted = predictions_by_k[k]
        hits = sum(1 for sample in range(test_count)
                   if predicted[sample] == Y_test[sample])
        accuracies[k] = hits / float(test_count)

    plot_graph(accuracies)
    dum_accuracies_file(accuracies)
Exemple #7
0
def loadData(data):
    """Read a split and return it as a DataFrame of flattened images.

    Args:
        data: Passed straight through to ``read``.

    Returns:
        A DataFrame with one row per image (values divided by 255, each image
        flattened) plus a ``'label'`` column holding the labels.
    """
    labels, images = read(data)

    # Normalise, then flatten every image into a single row.
    scaled = images / 255
    rows = np.asarray([picture.flatten() for picture in scaled])

    table = pd.DataFrame(rows)
    table['label'] = labels
    return table
def predict(W):
    """Return the accuracy of weight matrix ``W`` on the testing split.

    Args:
        W: Weights multiplied with the flattened, scaled test images.

    Returns:
        Fraction of test samples whose arg-max of ``softmax(X_test . W)``
        equals the true label, as a Python float.
    """
    Y_test, X_test = read("testing")
    X_test = X_test.reshape((X_test.shape[0], 784)) / float(255)
    sample_count = X_test.shape[0]

    # Highest-scoring column per row is the predicted class.
    scores = softmax(npy.dot(X_test, W))
    Y_pred = npy.argmax(scores, axis=1)

    hits = sum(1 for i in range(sample_count) if Y_test[i] == Y_pred[i])
    return hits / float(sample_count)
Exemple #9
0
# coding: utf-8

# In[3]:

import numpy as np
import load_dataset
import timeit
import collections
import timeit
from bisect import bisect
import scipy.spatial.distance as sd


# In[4]:

# Load both splits from the "MNIST" directory; read() returns (labels, images).
y_train , x_train= load_dataset.read("training","MNIST")
y_test, x_test= load_dataset.read("testing","MNIST")
# Flatten every image into one row of 28*28 values (sample counts are hard-coded).
x_train = x_train.reshape([60000,28*28])
x_test = x_test.reshape([10000,28*28])


# In[ ]:

# Pairwise distances between every test row and every training row, using
# cdist's default metric.
# NOTE(review): given the reshapes above this should be a 10000 x 60000
# matrix, which is several GB if stored as float64 - confirm memory budget.
distanceMatrix = sd.cdist(x_test,x_train)


# In[ ]:

# Neighbour counts to evaluate.
k = [1,3,5,10,30,50,70,80,90,100]

Exemple #10
0
import numpy as np
#from scipy.special import expit
from load_dataset import read, show

# Load both splits; read() yields (labels, images) for each.
[train_lab, train_img] = read()
[test_lab, test_img] = read("testing")

# Reshape each image stack to an n x (height * width) matrix of ints.
# Pixel values are not rescaled here.
train_img = train_img.reshape(
    train_img.shape[0], (train_img.shape[1] * train_img.shape[2])).astype(int)
test_img = test_img.reshape(
    test_img.shape[0], (test_img.shape[1] * test_img.shape[2])).astype(int)

# Disabled debugging aid: smaller subsets for quick experiments.
#train_tl=train_lab[:10000]
#train_ti=train_img[:10000]
#test_tl=test_lab[:5000]
#test_ti=test_img[:5000]
#print(train_tl.shape,train_ti.shape,test_tl.shape,test_ti.shape)

# In logistic regression, all we need is to find this w (the bias b is ignored).
# Starts as a 10 x 784 matrix of zeros.
w = np.zeros((10, 784), dtype=float)


#final predict function
def sigmoid(w, X):
    """Return the element-wise logistic function of ``np.matmul(X, w)``.

    The value is ``1 / (1 + exp(-z))`` with ``z = np.matmul(X, w)``, but it is
    evaluated through ``exp(-|z|)`` so that large-magnitude negative ``z`` can
    no longer overflow ``np.exp`` (which previously emitted a RuntimeWarning).

    Args:
        w: Weights, used as the right operand of ``np.matmul``.
        X: Inputs, used as the left operand of ``np.matmul``.

    Returns:
        The sigmoid values converted to ``float`` via ``astype``.
    """
    z = np.matmul(X, w)

    # exp(-|z|) always lies in (0, 1], so it cannot overflow.
    decay = np.exp(-np.abs(z))

    # z >= 0:  1 / (1 + e^-z)          -> numerator 1
    # z <  0:  e^z / (1 + e^z)         -> numerator e^z == decay
    numerator = np.where(z >= 0, 1.0, decay)
    return (numerator / (1 + decay)).astype(float)


#find w
#first step: relabel to 0,1. such that for 5, y[4][i] is a 0-1 label array.
Exemple #11
0
    def __init__(self, image, label):
        """Store the given image and its label on the instance."""
        self.image, self.label = image, label


class BallTreeNode:
    """Node of a ball tree.

    Holds the constructor's ``image`` (as ``imageData``), ``radius`` and
    ``leafs`` values; both child links start out unset.
    """

    def __init__(self, image, radius, leafs):
        """Create a node with no children attached yet."""
        # Children are linked later by whoever builds the tree.
        self.left = self.right = None
        self.imageData, self.radius, self.leafs = image, radius, leafs


# Directory handed to load_dataset.read for both splits.
path = "../assignment2_data/"
trainingData = load_dataset.read("training", path)
testData = load_dataset.read("testing", path)

# Subset size: `size` test samples and six times as many training samples.
size = 200

# Index 0 of each read() result is used as the labels, index 1 as the images.
trainLbls = trainingData[0][:size * 6]
trainImgs = trainingData[1][:size * 6]
testLbls = testData[0][:size]
testImgs = testData[1][:size]
# Neighbour counts to evaluate.
ks = [1, 3, 5, 10, 30, 50, 70, 80, 90, 100]

# Rebind both names to empty lists; from here on they no longer refer to the
# raw read() results sliced above.
trainingData = []
testData = []

for i in range(len(trainLbls)):
    image = [[0] * 28 for _ in range(28)]
from __future__ import division
import scipy
import numpy as np
import load_dataset as ld
import matplotlib.pyplot as plt

# Default split from ld.read(); index 0 is used as labels, index 1 as images.
train_data = ld.read()
# NOTE(review): presumably the learning rate - confirm against the training loop.
alpha = 0.001
y_train_data = train_data[0]
# One row of 28*28 values per image, divided by 255 (sample count hard-coded).
x_train_data = train_data[1].reshape([60000, 28 * 28]) / 255

test_data = ld.read('testing')
y_test_data = test_data[0]
x_test_data = test_data[1].reshape([10000, 28 * 28]) / 255

# 9 rows of 784 zeros: one weight vector for each of y = 0..8. post_prob
# handles y == 9 without a weight row of its own.
weights = np.tile(np.zeros(784), (9, 1))


def post_prob(x, y):
    """Return the posterior probability of class ``y`` for sample ``x``.

    Class 9 has no weight row of its own and contributes a numerator of 1;
    every other class contributes ``exp(weights[y] . x)``. Both are divided
    by ``sum_exps(x)``.
    """
    numerator = 1 if y == 9 else np.exp(np.dot(weights[y], x))
    return numerator / sum_exps(x)


def sum_exps(x):
    """Return ``1 + sum(exp(row . x))`` over every row of ``weights``."""
    per_class = [np.exp(np.dot(row, x)) for row in weights]
    return 1 + np.sum(per_class)
Exemple #13
0
import load_dataset
import time
import numpy as np

# Wall-clock reference taken before any data is loaded.
startTime = time.time()

# Directory handed to load_dataset.read for both splits.
path = "../assignment2_data/"
trainings = load_dataset.read("training", path)
tests = load_dataset.read("testing", path)


class ImageData:
    """Pair a label with its image data.

    Both constructor arguments are stored unchanged as the ``label`` and
    ``image`` attributes.
    """

    def __init__(self, label, image):
        """Store ``label`` and ``image`` on the instance."""
        self.label = label
        # The original assigned self.image twice; once is enough.
        self.image = image


# Number of samples taken from each split.
testSize = 10000
trainSize = 60000
# Base rate divided by the number of training samples.
learningRate = 0.00002 / trainSize

# One ImageData per training sample: index 0 of `trainings` supplies the
# labels, index 1 the images; each image is flattened and converted to float.
trainingData = [
    ImageData(label,
              image.flatten().astype(float))
    for label, image in zip(trainings[0][:trainSize], trainings[1][:trainSize])
]
testData = [
    ImageData(label,
              image.flatten().astype(float))
    for label, image in zip(tests[0][:testSize], tests[1][:testSize])
 def load_train_test(self):
     """Read both splits and flatten every image to 28 * 28 values.

     ``training_size`` and ``testing_size`` are looked up as plain names
     (not attributes of ``self``). Pixel values are not rescaled.

     Returns:
         Tuple ``(X_train, Y_train, X_test, Y_test)``.
     """
     train_labels, train_images = ld.read('training')
     test_labels, test_images = ld.read('testing')

     flat_train = np.reshape(train_images, (training_size, 28 * 28))
     flat_test = np.reshape(test_images, (testing_size, 28 * 28))
     return (flat_train, train_labels, flat_test, test_labels)