Example #1
def validate_k(X, y, k_list):
    """
    Perform stratified 5-fold cross-validation on dataset X, y with
    varying K values from k_list. Used to derive some intuition
    about what range of k is worth validating over.

    Returns a 5 x len(k_list) array of test accuracies
    """
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    accuracy = np.zeros((5, len(k_list)))
    i = 0
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        for j, k in enumerate(k_list):
            classifier = kNN()
            classifier.train(X_train, y_train)
            y_pred = classifier.predict(X_test, k)
            acc = (y_pred == y_test).mean()
            accuracy[i, j] = acc
            print("Q1 -- TIME: {} Fold {}, k: {}, Accuracy: {}".format(
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'), i + 1, k, acc))
        i += 1
    return accuracy
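A minimal usage sketch (not part of the original example); it assumes the custom kNN class with train(X, y) and predict(X, k) methods is importable, and pulls in the names the excerpt relies on:
import numpy as np
from datetime import datetime
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold
from knn import kNN  # assumed module providing the custom classifier

X, y = load_iris(return_X_y=True)
acc = validate_k(X, y, k_list=[1, 3, 5, 7, 9])
print(acc.mean(axis=0))  # mean test accuracy per k over the 5 folds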
Example #2
def q1_regularised(X, y, k_list):
    """
    For 20 different test/train splits, calculate
    the classification accuracy on the test set using
    the values in k_list.

    Returns:
    test_acc_ar: np.array of shape (20, len(k_list)): test accuracies
        for all 20 runs, across all hyperparameters in k_list.
    """
    test_acc_ar = np.zeros((20, len(k_list)))
    for i in range(20):
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=i,
                                                            stratify=y)
        print("Run {}".format(i))
        for j, k in enumerate(k_list):
            classifier = kNN()
            classifier.train(X_train, y_train)
            y_pred = classifier.predict(X_test, k)
            test_acc = (y_pred == y_test).mean()
            test_acc_ar[i, j] = test_acc
            print("k Parameter: {}, Accuracy: {}".format(k, test_acc))
    return test_acc_ar
Example #3
def result(xtrain, xtest, ytrain, ytest, k):
    print('Results for kNN with k =', k)

    clf = knn.kNN(k=k, distance_m=distance)
    clf.fit(xtrain, ytrain)
    prd = clf.predict(xtest)

    acc = accuracy_score(ytest, prd)
    print('Accuracy:', acc)
    print('Confusion Matrix')
    print(confusion_matrix(ytest, prd))
    return acc
Example #5
def main():
    """Do a test if called from the command line"""
    data = pd.read_csv(DATAFILE, header=None, names=HEADER)
    X = data[FEATURES]
    y = data.species

    model = kNN(k=10)
    model.fit(X, y)
    print "Accuracy on training set:", model.score(X, y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.80)
    model.fit(X_train, y_train)
    print "Accuracy on test set:    ", model.score(X_test, y_test)
Example #6
def main():
    # List of patient objects
    patient_list = parse_csv()

    # create the ten folds
    ten_folds_strat_list = stratify_data(patient_list)

    # create the classifier objects
    knn = kNN()
    naive_bayes = naiveBayes()

    # call the 10-fold cross validation
    ten_fold_strat_cross_validation(knn, ten_folds_strat_list, 10)
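    # (the second call presumably relies on a default fold count inside ten_fold_strat_cross_validation)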
    ten_fold_strat_cross_validation(naive_bayes, ten_folds_strat_list)
Example #7
def runTests(training, tr_classes, test, t_classes, k):
    errors = 0
    for i in range(len(test)):
        c = knn.kNN(k, test[i], training, tr_classes)
        if c != t_classes[i]:
            # print("Classification: " + str(c) + " Correct answer: " + str(t_classes[i]))
            errors = errors + 1

    accuracy = 1 - float(errors) / len(test)
    print("######## K = " + str(k) + " ########")
    print("Dataset size: " + str(len(training) + len(test)))
    print("Training set size: " + str(len(training)))
    print("Test set size: " + str(len(test)))
    print("Errors: " + str(errors))
    print("Accuracy: " + str(accuracy))
    print("\n")

    return accuracy
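A hypothetical calling sketch; the knn module and the features/labels arrays are assumptions, not part of the excerpt:
from sklearn.model_selection import train_test_split

# Hypothetical data: any numeric feature matrix with a matching label array
X_tr, X_te, y_tr, y_te = train_test_split(features, labels, test_size=0.2)
for k in (1, 3, 5):
    runTests(X_tr, y_tr, X_te, y_te, k)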
Example #8
def nca_mnist_experiment(trial, train_percentage=0.1, test_percentage=0.1):

    encoding_train_imgs_path = './data/MNIST_encoding/tf_train.encoding'
    encoding_test_imgs_path = './data/MNIST_encoding/tf_test.encoding'

    train_labels_path = './data/MNIST_encoding/tf_train.labels'
    test_labels_path = './data/MNIST_encoding/tf_test.labels'

    encoding_train = pickle.load(open(encoding_train_imgs_path, 'rb'))
    encoding_test = pickle.load(open(encoding_test_imgs_path, 'rb'))

    print(encoding_train.shape)

    train_labels = pickle.load(open(train_labels_path, 'rb'))
    test_labels = pickle.load(open(test_labels_path, 'rb'))

    print(train_labels.shape)

    m = len(encoding_train)
    train_m = int(m * train_percentage)
    sel = random.sample(range(m), train_m)
    X = encoding_train.astype(float)[sel]  # np.float was removed in recent NumPy versions
    y = train_labels[sel]

    print(X.shape)
    print(y.shape)

    m = len(encoding_test)
    test_m = int(m * test_percentage)
    sel = random.sample(range(m), test_m)

    X_test = encoding_test.astype(float)[sel]
    y_test = test_labels[sel]

    print(X_test.shape)
    print(y_test.shape)

    knn = kNN()
    k_values = [1, 3, 5, 7]
    for k in k_values:
        knn.k = k

        acc_list = []
        for _ in range(trial):
            acc = knn.evaluate(X, y, X_test, y_test)
            acc_list.append(acc)

        print(np.mean(np.array(acc_list)))

    nca = NCA(max_iter=100, learning_rate=0.01)
    nca.fit(X, y)
    x_train = nca.transform()
    x_test = nca.transform(X_test)

    for k in k_values:
        knn.k = k

        acc_list = []
        for _ in range(trial):
            acc = knn.evaluate(x_train, y, x_test, y_test)
            acc_list.append(acc)

        print(np.mean(np.array(acc_list)))
Example #9
import datetime
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from load_data import *
from knn import kNN
from plot import plot


if __name__ == "__main__":
    train_set = load_data(10)
    plot(train_set)
    new_data = ['ZL', 169, 2]

    train_data = np.array(train_set[['打斗镜头', '接吻镜头']])
    train_labels = np.array(train_set[['电影类别']])
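    # (the Chinese column names translate to: '打斗镜头' = fight scenes, '接吻镜头' = kissing scenes, '电影类别' = movie genre)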

    time_s = datetime.datetime.now()
    # =========================== manual implementation ======================
    label = kNN(new_data[1:], train_data, train_labels, k=3)
    time_e = datetime.datetime.now() - time_s
    print('Elapsed time:', time_e)
    print('Class of the new data point:', label)
    # =========================== sklearn implementation ======================
    time_s = datetime.datetime.now()  # reset the timer so the two runs are timed separately
    clf = KNeighborsClassifier(n_neighbors=3)
    clf.fit(train_data, train_labels.ravel())  # sklearn expects a 1-D label array
    label = clf.predict([new_data[1:]])  # the input must be 2-D
    time_e = datetime.datetime.now() - time_s
    print('Elapsed time:', time_e)
    print('Class of the new data point:', label[0])
Example #10
# -*- coding: utf-8 -*-

import f_test as ftest
import knn as knn
import centroid as cc
import lr as lr
import svm as svm

if __name__ == '__main__':
    file_name = 'GenomeTrainXY.txt'
    raw_data = ftest.get_data(file_name)
    features, scores = ftest.f_test(raw_data)
    ftest.print_scores(features, scores)
    train = knn.pickTrainingData(file_name, features)
    test = knn.pickTestData("GenomeTestX.txt", features)
    print("\n\nPredictions for KNN (k=3) Classifier: ")
    knn.kNN(3, train, test)
    print("\nPredictions for Centroid Classifier: ")
    cc.centroid_classifier(train, test)
    print("\nPredictions for Linear Regression: ")
    lr.linear_regression(train, test)
    print("\nPredictions for SVM: ")
    svm.svm_classifier(train, test)
Example #11
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:

        def print_pass(*args):
            pass

        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    print("=> creating model '{}'".format(args.arch))
    model = moco.builder.MoCo(
        # models.__dict__[args.arch],
        netalexnet.alexnet,
        args.moco_dim,
        args.moco_k,
        args.moco_m,
        args.moco_t,
        args.mlp)
    print(model)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
        # comment out the following line for debugging
        raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        raise NotImplementedError("Only DistributedDataParallel is supported.")

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    testdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
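    # Note: this ImageNet-style normalize transform and the traindir/testdir paths above are
    # not used below; the transforms use CIFAR-style statistics and the datasets use hard-coded paths.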
    transform_train = transforms.Compose([
        transforms.Resize(size=256),
        transforms.RandomResizedCrop(size=224, scale=(0.2, 1.)),
        transforms.ColorJitter(0.4, 0.4, 0.4, 0.4),
        transforms.RandomGrayscale(p=0.2),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])

    transform_test = transforms.Compose([
        transforms.Resize(size=256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])

    train_dataset = ImageFolderInstance(
        '/data2/zyf/ImageNet/ILSVRC2012-100/train',
        transform=transform_train,
        two_crop=True)

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    # train_loader = torch.utils.data.DataLoader(
    #     train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
    #     num_workers=args.workers, pin_memory=True, sampler=train_sampler, drop_last=True)
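    # Note: train_sampler is not passed to this DataLoader, so in distributed runs every
    # process iterates over the full shuffled dataset (the commented-out loader above
    # shows the sampler-based variant).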
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               drop_last=True)

    test_dataset = ImageFolderInstance(
        '/data2/zyf/ImageNet/ILSVRC2012-100/val', transform=transform_test)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=100,
                                              shuffle=False,
                                              num_workers=args.workers,
                                              drop_last=True)

    ndata = len(train_dataset)
    print(ndata)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # print('*******************')
        # acc = kNN(0, model, train_loader, test_loader, 200, 0.1, ndata, low_dim=128)
        # print('+++++++++++++++++')
        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        print('----------Evaluation---------')
        start = time.time()
        acc = kNN(0,
                  model,
                  train_loader,
                  test_loader,
                  200,
                  0.1,
                  ndata,
                  low_dim=128)
        print("Evaluation Time: '{}'s".format(time.time() - start))

        writer.add_scalar('nn_acc', acc, epoch)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                },
                is_best=False,
                filename='checkpoint_{:04d}.pth.tar'.format(epoch))

    writer.close()
Example #12
# -*- coding: utf-8 -*-

import numpy as np
import cv2 as cv

import Dataset
import knn

path = "ordo_2.csv"
DS = Dataset.Dataset(path)
df = DS.getDF()
print(df.head())
X, Y = DS.getXY()
print(X)
k = 7
#gnb = GNB.GNB(X, Y)
clf = knn.kNN(k, X, Y)  # use a distinct name so the knn module is not shadowed
accuracy = clf.getAccuracy()
print(accuracy)
Example #13
    plt.ylabel("m")
    plt.title("{} Sample Complexity".format(title))
    plt.show()


if __name__ == "__main__":
    # run search for all algorithms
    A = [
        "Perceptron()",
        "Winnow()",
        "LinearRegression()",
    ]
    Atitle = [
        "Perceptron", "Winnow", "Least Squares", "One nearest-neighbours"
    ]
    for j, i in enumerate(A):
        alg = eval(i)
        neg = -1
        if i == "Winnow()":
            neg = 0
        mean, std = find_trend_m(alg, neg=neg)
        plot_trend(mean, std, Atitle[j])

    alg = kNN()
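    # neg keeps its value from the final loop iteration above (-1) and is reused here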
    mean, std = find_trend_m(alg,
                             neg=neg,
                             num_runs=10,
                             test_size=6000,
                             max_n=18)
    plot_trend(mean, std, 'OneNN')
Example #14
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
cmap = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

iris = datasets.load_iris()
X, y = iris['data'], iris['target']
# print(iris['target_names']) ['setosa' 'versicolor' 'virginica']

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=1234)

from knn import kNN

clf = kNN(k=5)

clf.fit(X_train, y_train)

prediction = clf.predict(X_test)
acc = np.sum(prediction == y_test) / len(y_test)
print(acc)

# plt.figure()
# plt.scatter(X[:, 2], X[:, 3], c=y, cmap=cmap, edgecolor='k', s=20)
# plt.show()
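For a quick sanity check (not part of the original example), the same split can be scored with scikit-learn's KNeighborsClassifier:
from sklearn.neighbors import KNeighborsClassifier

sk_clf = KNeighborsClassifier(n_neighbors=5)
sk_clf.fit(X_train, y_train)
print(sk_clf.score(X_test, y_test))  # should be close to the custom kNN accuracy above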
Example #15
def q2_regularised(X, y, n, k_list):
    """
    For n different 5-fold test/train splits, calculate
    the classification accuracy on the test set using
    the values in k_list

    Returns
    statistics: n by len(k_list) array of the mean and std
        test errors for each x-validated run. 
    k_max_index: list[int]: indexes of the most performant
        hyperparameters k for each one (use to derive optimal
        hyperparams from k_list)
    """
    accuracy = np.zeros((n, len(k_list), 5))
    for i in range(n):
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
        for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            for j, k in enumerate(k_list):
                classifier = kNN()
                classifier.train(X_train, y_train)
                y_pred = classifier.predict(X_test, k)
                acc = (y_pred == y_test).mean()
                accuracy[i, j, fold] = acc
                print("Q2 -- Run {}, Fold {}, k: {} ({}/{}), Accuracy: {}".format(
                    i + 1, fold + 1, k, j + 1, len(k_list), acc))
    statistics = np.zeros((2, len(k_list)))
    statistics[0, :] = np.mean(accuracy, axis=(0, 2))
    statistics[1, :] = np.std(accuracy, axis=(0, 2))

    # Mean accuracy per run and per k (averaged over folds), then the best k index per run
    k_mean = np.mean(accuracy, axis=2)
    k_max_index = np.argmax(k_mean, axis=1)
    print(k_max_index)
    # Most performant K parameter for each run
    optimal_params = [k_list[i] for i in k_max_index]
    print(optimal_params)
    optimal_accs = []
    # Re-run classification on a fresh 80/20 split using each run's best k
    for i, k in enumerate(optimal_params):
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=i,
                                                            stratify=y)
        classifier = kNN()
        classifier.train(X_train, y_train)
        y_pred = classifier.predict(X_test, k)
        acc = (y_pred == y_test).mean()
        print("K: {}, Accuracy: {}".format(k, acc))
        optimal_accs.append(acc)

    optimal_mean = np.mean(optimal_accs)
    optimal_std = np.std(optimal_accs)
    print(optimal_accs)
    print(k_max_index)
    print("Optimal Mean Accuracy: {} Optimal Std: {}".format(
        optimal_mean, optimal_std))

    return statistics, k_max_index
Example #16
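This excerpt refers to knnTrainingMatrix, labels, and projMatrix built earlier in the source file; below is a rough setup sketch under assumed file names and an assumed random Gaussian projection (none of it is the original code):
import numpy as np

# Hypothetical setup: load a space-separated training file (last column = label)
knnTrainingData = open('data/knntrain.txt', 'r').readlines()  # assumed path
knnTrainingRows = [np.fromstring(line, dtype=int, sep=' ') for line in knnTrainingData]
knnTrainingMatrix = np.array([row[:-1] for row in knnTrainingRows])
labels = np.array([row[-1] for row in knnTrainingRows])

# Hypothetical random projection to a lower-dimensional space (target dimension is arbitrary)
projMatrix = np.random.randn(knnTrainingMatrix.shape[1], 20) / np.sqrt(20)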
# projected training matrix
productMatrix = knnTrainingMatrix.dot(projMatrix)

# get test matrix
knnTestData = open('data/knntest.txt', 'r').readlines()
knnTestMatrix = []
for line in knnTestData:
    knnTestMatrix.append(np.fromstring(line, dtype=int, sep=' '))
knnTestMatrix = np.array(knnTestMatrix)

# normal kNN test error
numErrors = 0
for row in knnTestMatrix:
    vector = row[:-1]
    predictedLabel = kNN(vector, knnTrainingMatrix, labels, 15)
    if predictedLabel != row[-1]:
        numErrors = numErrors + 1
print(float(numErrors) / len(knnTestMatrix))

# projected kNN test error
numErrors = 0
for row in knnTestMatrix:
    vector = row[:-1]
    projVector = vector.dot(projMatrix)
    predictedLabel = kNN(projVector, productMatrix, labels, 15)
    if predictedLabel != row[-1]:
        numErrors = numErrors + 1
print(float(numErrors) / len(knnTestMatrix))

Example #17
def modelKNN(self, instanceFeature, k):
    """Performs a kNN on the model data and returns a dictionary of the vote
    proportions for the k nearest instances."""
    return knn.kNN(self.data, instanceFeature, k)
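A hypothetical calling sketch; the enclosing model class, its data attribute, and the return format are assumptions based on the docstring:
# votes = model.modelKNN(instance_features, k=5)   # e.g. {'classA': 0.6, 'classB': 0.4}
# predicted = max(votes, key=votes.get)            # class with the largest vote share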