Example #1
def Cross_validation(X_train, y_train):
    """交叉验证,确定超参K,同时可视化K值

    :param X_train: 训练集
    :param y_train: 训练标签
    """
    num_folds = 5
    k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]
    k_accuracy = {}
    # Split the data into 5 folds
    X_train_folds = np.array_split(X_train, num_folds)
    y_train_folds = np.array_split(y_train, num_folds)
    # Evaluate each candidate k
    for k in k_choices:
        k_accuracy[k] = []
        # For each k, measure accuracy with each fold held out in turn
        for index in range(num_folds):
            # Build the training/validation split for this fold
            X_te = X_train_folds[index]
            y_te = y_train_folds[index]
            X_tr = np.concatenate(
                X_train_folds[:index] + X_train_folds[index + 1:])
            y_tr = np.concatenate(
                y_train_folds[:index] + y_train_folds[index + 1:])
            # Train and predict
            classify = KNearestNeighbor()
            classify.train(X_tr, y_tr)
            y_te_pred = classify.predict(X_te, k=k)
            accuracy = np.sum(y_te_pred == y_te) / float(X_te.shape[0])
            k_accuracy[k].append(accuracy)

    for k, accuracylist in k_accuracy.items():
        for accuracy in accuracylist:
            print("k = %d, accuracy = %.3f" % (k, accuracy))

    # Visualize accuracy for each k
    for k in k_choices:
        accuracies = k_accuracy[k]
        plt.scatter([k] * len(accuracies), accuracies)
    accuracies_mean = np.array(
        [np.mean(v) for k, v in sorted(k_accuracy.items())])
    accuracies_std = np.array(
        [np.std(v) for k, v in sorted(k_accuracy.items())])
    # Error-bar plot from the per-k mean and standard deviation
    plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)
    plt.title('Cross-validation on k')
    plt.xlabel('k')
    plt.ylabel('Cross-validation accuracy')
    plt.show()
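A minimal usage sketch, assuming X_train and y_train have already been flattened to one row per sample (Example #14 below prepares its data exactly this way; the 5000-sample subset is an assumption, not from the original):

# hypothetical invocation
Cross_validation(X_train[:5000].reshape(5000, -1), y_train[:5000])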
Example #2
def test_arange(self):
    train = np.arange(150).reshape(5, -1)
    test = np.square(np.arange(2, 122)).reshape(4, -1)
    knn = KNearestNeighbor()
    knn.train(train, None)
    d_two = knn.compute_distances_two_loops(test)
    d_one = knn.compute_distances_one_loop(test)
    d_no = knn.compute_distances_no_loops(test)
    self.assertAlmostEqual(0, np.linalg.norm(d_two - d_one, ord='fro'))
    self.assertAlmostEqual(0, np.linalg.norm(d_no - d_one, ord='fro'))
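The test compares three implementations of the same pairwise L2 distance matrix, but the snippets on this page never show the class itself. A minimal sketch of the pattern they assume (method names taken from the snippets; the bodies are an assumption based on the usual vectorized expansion ||x - y||^2 = ||x||^2 - 2 x.y + ||y||^2):

import numpy as np

class KNearestNeighbor:
    def train(self, X, y):
        # kNN "training" is a no-op: just memorize the data
        self.X_train = X
        self.y_train = y

    def compute_distances_two_loops(self, X):
        dists = np.zeros((X.shape[0], self.X_train.shape[0]))
        for i in range(X.shape[0]):
            for j in range(self.X_train.shape[0]):
                dists[i, j] = np.sqrt(np.sum((X[i] - self.X_train[j]) ** 2))
        return dists

    def compute_distances_one_loop(self, X):
        dists = np.zeros((X.shape[0], self.X_train.shape[0]))
        for i in range(X.shape[0]):
            dists[i] = np.sqrt(np.sum((self.X_train - X[i]) ** 2, axis=1))
        return dists

    def compute_distances_no_loops(self, X):
        # ||x - y||^2 = ||x||^2 - 2 x.y + ||y||^2, vectorized over all pairs
        sq_test = np.sum(X ** 2, axis=1, keepdims=True)
        sq_train = np.sum(self.X_train ** 2, axis=1)
        cross = X.dot(self.X_train.T)
        # clip tiny negatives from floating-point cancellation before sqrt
        return np.sqrt(np.maximum(sq_test - 2 * cross + sq_train, 0))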
Example #4
        for i, idx in enumerate(idxs):
            plt_idx = i * class_num + y + 1
            plt.subplot(samples_pre_class, class_num, plt_idx)
            plt.imshow(X_train[idx].astype('uint8'))
            plt.axis('off')
            if i == 0:
                plt.title(cls)

    plt.show()

    X_train = X_train.reshape(500, -1)
    y_train = y_train.reshape(-1)
    X_test = X_test.reshape(10, -1)
    y_test = y_test.reshape(-1)

    classifier = KNearestNeighbor()
    classifier.train(X_train, y_train)
    dists = classifier.compute_distance_two_loops(X_test)
    dists_one = classifier.compute_distance_one_loop(X_test)

    diff = np.linalg.norm(dists - dists_one, ord='fro')
    if diff < 0.001:
        print('good')
    else:
        print('bad')
    y_pred = classifier.predict_labels(dists, 1)
    # compare against y_test (the predictions are for X_test); np.where returned
    # a tuple, so len() of it was not a count of correct predictions
    num_correct = np.sum(y_pred == y_test)
    print('accuracy: ', num_correct / len(y_test))

    # cross validation
    num_folds = 5
Example #5
print('Test data new shape: ', X_test.shape)

# Create a kNN classifier instance.
# Remember that training a kNN classifier is a noop:
# the Classifier simply remembers the data and does no further processing
# classifier = KNearestNeighbor()
# classifier.train(X_train, y_train)

# numK = [8,9,10,11,12,13,14,15,16]
numK = [12]
results = {}
bestValAcc = 0
bestK = None

for num in numK:
    knn = KNearestNeighbor()
    knn.train(X_train, y_train)
    y_train_pred = knn.predict(X_train, k=num)
    y_val_pred = knn.predict(X_val, k=num)
    trainAcc = np.mean(y_train == y_train_pred)
    valAcc = np.mean(y_val == y_val_pred)
    print('k: %d train accuracy: %.4f val accuracy: %.4f' % (num, trainAcc,
                                                             valAcc))
    if valAcc > bestValAcc:
        bestValAcc = valAcc
        bestK = num

print('best validation accuracy achieved: %.4f, with best k: %d' % (
    bestValAcc, bestK))

# Based on the cross-validation results above, choose the best value for k,
Example #6
y_train = y_train.reshape(-1, 1)

x_train_folds = np.array_split(x_train, num_folds)
y_train_folds = np.array_split(y_train, num_folds)

x_train_folds = np.array(x_train_folds)
y_train_folds = np.array(y_train_folds)

k_to_accuracies = {}

for k in k_choices:
    k_to_accuracies.setdefault(k, [])

for i in range(num_folds):
    classifier = KNearestNeighbor()
    x_val_train = np.concatenate((x_train_folds[0:i], x_train_folds[i + 1:]),
                                 axis=0)
    x_val_train = x_val_train.reshape(-1, x_val_train.shape[2])
    y_val_train = np.concatenate((y_train_folds[0:i], y_train_folds[i + 1:]),
                                 axis=0)
    y_val_train = y_val_train.reshape(-1, y_val_train.shape[2])

    y_val_train = y_val_train[:, 0]
    classifier.train(x_val_train, y_val_train)
    for k in k_choices:
        y_val_pred = classifier.predict_labels(x_train_folds[i], k=k)
        num_correct = np.sum(y_val_pred == y_train_folds[i][:, 0])
        accuracy = float(num_correct) / len(y_val_pred)
        k_to_accuracies[k].append(accuracy)
Example #7
num_training = 5000
mask = range(num_training)
X_train = X_train[mask]
y_train = y_train[mask]

num_test = 500
mask = range(num_test)
X_test = X_test[mask]
y_test = y_test[mask]

X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))
print(X_train.shape, X_test.shape)

classifier = KNearestNeighbor()
classifier.train(X_train, y_train)

dists = classifier.compute_distances_no_loops(X_test)
print(dists.shape)

plt.imshow(dists, interpolation='none')
plt.show()

y_test_pred = classifier.predict_labels(dists, k=5)

# Compute and print the fraction of correctly predicted examples
num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
print('Got %d / %d correct => accuracy: %f' %
      (num_correct, num_test, accuracy))
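Most snippets here also call predict_labels without showing it. A sketch of the usual majority-vote implementation, as a method of the class sketched after Example #2 (an assumption following the common CS231n pattern, not the original code):

import numpy as np

def predict_labels(self, dists, k=1):
    # For each test row, vote among the labels of the k nearest training points
    num_test = dists.shape[0]
    y_pred = np.zeros(num_test, dtype=self.y_train.dtype)
    for i in range(num_test):
        closest_y = self.y_train[np.argsort(dists[i])[:k]]
        # bincount + argmax breaks ties in favor of the smaller label
        y_pred[i] = np.bincount(closest_y).argmax()
    return y_pred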
Example #8
X_train = X_train[mask]
y_train = y_train[mask]

num_test = 500
mask = range(num_test)
X_test = X_test[mask]
y_test = y_test[mask]

# Reshape the training and test data into rows
X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))

print(X_train.shape)
print(X_test.shape)

classifier = KNearestNeighbor()
classifier.train(X_train, y_train)

dists = classifier.compute_distances_two_loops(X_test)
print(dists.shape)

plt.imshow(dists, interpolation='none')
plt.show()

# Now implement the function predict_labels and run the code below:
# with k = 1
y_test_pred = classifier.predict_labels(dists, k=1)

# Compute and print the fraction of correctly predicted examples
num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
Example #9
import csv
from distance import manhattan_distance, geometric_distance
from datapoint import DataPoint
from k_nearest_neighbor import KNearestNeighbor
from math import *


def load_dataset(filename):
	print("loading file " + filename + "...")
	with open(filename, newline='') as csv_file:
		return [DataPoint(item[1:], item[0]) for item in csv.reader(csv_file, delimiter=',')]

training_set = load_dataset('train.csv')
test_set = load_dataset('test.csv')

for test_item in test_set:
	knn = KNearestNeighbor(test_item, manhattan_distance, 3)
	for training_item in training_set:
		knn.check(training_item)

	# `closest_neighbor` is presumably exposed by `knn` once every training
	# item has been checked; the accessor is not part of this snippet.
	correct = test_item.value == closest_neighbor.value
	if not correct:
		test_item.show()
		closest_neighbor.show()

Example #10
cifar_10_dir = './cifar-10-batches-py'
x_train, y_train, x_test, y_test = load_cifar10(cifar_10_dir)
print('train_data_shape:', x_train.shape)
print('train_labels_shape:', y_train.shape)
print('test_data_shape:', x_test.shape)
print('test_labels_shape:', y_test.shape)

x_train = x_train.reshape(x_train.shape[0], -1)
x_test = x_test.reshape(x_test.shape[0], -1)
num_train = x_train.shape[0]
num_test = x_test.shape[0]

# num_train = 5000
# mask = range(num_train)
# x_train = x_train[mask]
# y_train = y_train[mask]
# num_test = 500
# mask = range(num_test)
# x_test = x_test[mask]
# y_test = y_test[mask]

classifier = KNearestNeighbor()
classifier.train(x_train, y_train)
dists = classifier.compute_distance(x_test)
y_test_pred = classifier.predict_labels(dists, k=10)

num_correct = np.sum(y_test_pred == y_test)
accuracy = num_correct / num_test
print('got %d / %d correct => accuracy: %f' %
      (num_correct, num_test, accuracy))
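A note on why the commented-out subsampling block above is usually kept handy: on the full 50,000/10,000 CIFAR-10 split, the float64 distance matrix alone is sizable. A quick back-of-the-envelope check:

num_test, num_train = 10000, 50000
print('distance matrix: %.1f GiB' % (num_test * num_train * 8 / 2**30))  # ~3.7 GiB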
Example #11
def run_model_with_cross_validation(model_name, knn_mode, k_number):

	#GET DATA
	#- expect data_0 ... data_4
	data_groups = list()
	for i in range(5):
		data_groups.append(np.loadtxt('data_%d' % i, delimiter=','))

	NUM_GROUPS = len(data_groups)

	#For each data group, train on all the others and test on the held-out group
	culminating_result = 0

	for test_group_id in range(NUM_GROUPS):

		#Form training data as 4/5 data
		train_data = np.array([])
		for train_group_id in range(len(data_groups)):
			if (train_group_id != test_group_id):
				#Initialize train_data if necessary
				if (train_data.size == 0):
					train_data = np.copy(data_groups[train_group_id])
				else:
					train_data = np.concatenate(
							(train_data, data_groups[train_group_id]), axis=0)

		print('train_data, group ', str(test_group_id), 'length: ', len(train_data))
		print(train_data)

		test_data = data_groups[test_group_id]

		result = 0
		model = None
		if (model_name == 'knn'):
			model = KNearestNeighbor(train_data, k_number)
			model.train(train_data)
			print('KNN train data length', len(model.data))
			result = model.test(test_data, knn_mode)
		elif (model_name == 'c_knn'):
			model = CondensedKNearestNeighbor(train_data, k_number)
			#Mode is always majority...this is not used for regression
			mode = "majority"
			model.train(train_data)
			print('condensed KNN train data length', len(model.data))
			result = model.test(test_data, mode)
		else:
			print('error - ', model_name, ' is not a supported model')
			return

		print('test_data, group ', str(test_group_id), 'length:', len(test_data))
		print(test_data)

		print()
		print('result of iteration ' + str(test_group_id))
		print(result)
		print()

		culminating_result = culminating_result + result
	
	final_average_result = culminating_result / NUM_GROUPS
	print()
	print('final average result:')
	print(final_average_result)
	print()

	return final_average_result
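A hypothetical invocation, assuming the files data_0 through data_4 sit next to the script (the argument values are illustrative, not from the original):

run_model_with_cross_validation('knn', 'majority', 3)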
Example #12
    data_file = open(test[0])
    print("Running with %s" % test[0])
    for line in data_file:
        line_split = line.split(',')
        data_instances.append(list(map(float, line_split)))
    data_instances = np.array(data_instances)
    np.random.shuffle(data_instances)

    # 5 fold cross validation
    learner_type = "CLASSIFICATION"
    fold_size = data_instances.shape[0] // 5
    data_indices = list(range(data_instances.shape[0]))
    for k in range(1, 100, 5):
        total_performance = 0.0
        for holdout_fold_idx in range(5):
            kNN_model = KNearestNeighbor(k, learner_type)
            kNN_model.train(data_instances[ \
                    np.array( \
                        np.setdiff1d(data_indices, data_indices[ \
                                fold_size * holdout_fold_idx : \
                                fold_size * holdout_fold_idx + fold_size]))])
            kNN_model.condense_training_data()
            #  predict test data using k-NN and average performance
            predictions = kNN_model.predict( \
                data_instances[ \
                    fold_size * holdout_fold_idx : \
                    fold_size * holdout_fold_idx + fold_size])
            successes = fold_size - \
                sum(abs(
                   predictions - \
                   data_instances[
Example #13
	def __init__(self, data, k_number):
		KNearestNeighbor.__init__(self, data, k_number)
Example #14
# Visualize the images
VisualizeImage(X_train, y_train)
input('Press Enter to run cross-validation...')

# Build the subset used for cross-validated hyperparameter tuning (a plain validation set would also work, since there is still plenty of data)
num_training = 5000
X_tr = X_train[:num_training, ::]
X_tr = np.reshape(X_tr, (X_tr.shape[0], -1))
y_tr = y_train[:num_training]
# print(X_tr.shape, y_tr.shape)

num_testing = 500
X_te = X_test[:num_testing, ::]
X_te = np.reshape(X_te, (X_te.shape[0], -1))
y_te = y_test[:num_testing]
# print(X_te.shape, y_te.shape)

# Cross-validate to choose k
Cross_validation(X_tr, y_tr)
input('Press Enter to train the model...')

# Train on the "full" data (here 5,000 samples as the training set and 500 as the test set, since all 60,000 would exhaust memory; the plot suggests k = 10 is best)
classify = KNearestNeighbor()
classify.train(X_tr, y_tr)
y_te_pred = classify.predict(X_te, k=10)
accuracy = np.sum(y_te_pred == y_te) / float(X_te.shape[0])
print('Final test: K = %d, accuracy = %.3f' % (10, accuracy))


Example #15
                                fold_size * holdout_fold_idx + fold_size]))])
            centroids = kmeans_model.get_centroids()
            for cluster_idx in range(len(clusters)):
                ave_label = 0.0
                for instance in clusters[cluster_idx]:
                    ave_label += instance[-1]
                if len(clusters[cluster_idx]) > 0:
                    ave_label = ave_label / len(clusters[cluster_idx])
                if learner_type == "CLASSIFICATION":
                    ave_label = int(round(ave_label))
                centroids[cluster_idx].append(ave_label)

            #     for classification, vote to determine centroid classification
            #     for regression, average to find centroid estimate
            #  feed centroids into k-NN as training data
            kNN_model = KNearestNeighbor(best_ks[test[0]], learner_type)
            kNN_model.train(centroids)
            #  predict test data using k-NN and average performance
            predictions = kNN_model.predict( \
                data_instances[ \
                    fold_size * holdout_fold_idx : \
                    fold_size * holdout_fold_idx + fold_size])
            if kNN_model.learner_type == "CLASSIFICATION":
                successes = fold_size - \
                    sum(abs(
                        predictions - \
                        data_instances[
                            fold_size * holdout_fold_idx :
                            fold_size * holdout_fold_idx + fold_size,-1]))
                performance = successes / fold_size
            elif kNN_model.learner_type == "REGRESSION":
Example #16
        file_name = train_files[i]
        file_str = file_name.split('.')[0]
        class_num = int(file_str.split('_')[0])
        digits_labels.append(class_num)
        training_mat[i, :] = img2vector('../../data/digits/trainingDigits/%s' %
                                        file_name)
    # Fit on the training set
    classifier.fit(training_mat, digits_labels)

    test_files = os.listdir('../../data/digits/testDigits')
    err_cnt = 0
    m_test = len(test_files)
    for i in range(m_test):
        file_name = test_files[i]
        file_str = file_name.split('.')[0]
        class_num = int(file_str.split('_')[0])
        vec_under_test = img2vector('../../data/digits/testDigits/%s' %
                                    file_name)
        classifier_result = classifier.predict(vec_under_test)
        print('the classifier came back with: %d, the real answer is: %d' %
              (classifier_result, class_num))
        if classifier_result != class_num:
            err_cnt += 1
    print('the total number of errors is: %d' % err_cnt)
    print('the total error rate is: %f' % (err_cnt / float(m_test)))


if __name__ == '__main__':
    knn = KNearestNeighbor(k=10)
    digits_classify(classifier=knn)
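img2vector is assumed by this snippet and not shown; in the classic digits example it flattens a 32x32 text-encoded digit (one character per pixel) into a 1x1024 row vector. A sketch under that assumption:

import numpy as np

def img2vector(filename):
    # Read 32 lines of 32 '0'/'1' characters into a single 1x1024 row
    vec = np.zeros((1, 1024))
    with open(filename) as f:
        for i in range(32):
            line = f.readline()
            for j in range(32):
                vec[0, 32 * i + j] = int(line[j])
    return vec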
Example #17
# subsample train data. array indexing
X_train = X_train[mask]
y_train = y_train[mask]

num_test = 500
mask = range(num_test)
# subsample test data. array indexing
X_test = X_test[mask]
y_test = y_test[mask]

# reshape data to rows
X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))

# initialize the classifier object
knn_classifier = KNearestNeighbor()
# train with training set
# knn_classifier.train(X_train, y_train)
"""
# compute distances for test set
"""
# dists = knn_classifier.compute_distances_two_loops(X_test)
# dists_one_loop = knn_classifier.compute_distances_one_loop(X_test)
# dists_no_loop = knn_classifier.compute_distances_no_loops(X_test)
"""
# test one loop computation
"""
# difference = np.linalg.norm(dists - dists_one_loop, ord='fro')
# print('One-loop difference was: %f' % difference)
"""
test no loop computation
Example #18
# np.reshape parameters:
#   a : array_like
#       Array to be reshaped.
#   newshape : int or tuple of ints
#       The new shape should be compatible with the original shape. If an
#       integer, the result will be a 1-D array of that length. One shape
#       dimension can be -1.
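# A quick illustration of the inferred -1 dimension (illustrative,
# not part of the original snippet):
#   >>> np.arange(12).reshape(3, -1).shape
#   (3, 4)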

from k_nearest_neighbor  import KNearestNeighbor

# Create a kNN classifier instance. 
# Remember that training a kNN classifier is a noop: 
# the Classifier simply remembers the data and does no further processing 
classifier = KNearestNeighbor()
classifier.train(X_train, y_train)

#Open cs231n/classifiers/k_nearest_neighbor.py and implement
# compute_distances_two_loops.

# Test your implementation:
dists = classifier.compute_distances_two_loops(X_test)
print(dists.shape)

# We can visualize the distance matrix: each row is a single test example and
# its distances to training examples
plt.imshow(dists, interpolation='none')
plt.show()

# Now implement the function predict_labels and run the code below:
Example #19
mask = list(range(num_training))
X_train = X_train[mask]
y_train = y_train[mask]
num_test = 500
mask = list(range(num_test))
X_test = X_test[mask]
y_test = y_test[mask]

# Reshape the image data into rows
X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))
print(X_train.shape, X_test.shape)
# Create a kNN classifier instance, k = 1


classifier = KNearestNeighbor()
classifier.train(X_train, y_train)
"""
dists = classifier.compute_distance_two_loops(X_test)
print('dists.shape is')
print(dists.shape)
#plt.imshow(dists, interpolation='none')
#plt.savefig('/home/hongyin/file/cs231n-assignment1/picFaster.jpg')
y_test_pred = classifier.predict_labels(dists, k=1)
num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))


#k=5
y_test_pred = classifier.predict_labels(dists, k=5)
Example #20
import sklearn

if __name__ == '__main__':
    train_path = "/Users/zxj/Desktop/Mini1/train.pkl"
    train_data = pickle.load(open(train_path, "rb"))

    # Fixed_parameters
    # Please do not change the fixed parameters

    val_ratio = 0.2

    # student_parameters
    # You may want to change these in your experiment later.
    train_ratio = 1.0  # fraction of train_data used; of that, 0.8 goes to training and 0.2 to validation

    train_num = int(train_data['data'].shape[0] * train_ratio *
                    (1.0 - val_ratio))
    val_num = -1 * int(train_data['data'].shape[0] * train_ratio * val_ratio)
    KNN_classifier = KNearestNeighbor()
    KNN_classifier.train(train_data['data'][:train_num],
                         train_data['target'][:train_num])
    dists = KNN_classifier.compute_distances(train_data['data'][val_num:, :])
    k_choices = [2, 3, 5, 7, 9, 11, 15, 19]
    for k in k_choices:
        y_test_pred = KNN_classifier.predict_labels(dists, k)

        num_correct = np.sum(y_test_pred == train_data['target'][val_num:])
        accuracy = float(num_correct) / (-1 * val_num)
        print(
            'For K= %d and train_ratio= %f, Got %d / %d correct => VAL_accuracy: %f'
            % (k, train_ratio, num_correct, -1 * val_num, accuracy))
Example #21
num_test = 500
X_test = X_test[:num_test]
Y_test = Y_test[:num_test]

num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

x_train_folds = np.array_split(X_train, num_folds)
y_train_folds = np.array_split(Y_train, num_folds)

k_to_accuracies = {}

classifier = KNearestNeighbor()
for k in k_choices:
    accuracies = np.zeros(num_folds)
    for fold in range(num_folds):
        temp_X = x_train_folds[:]
        temp_y = y_train_folds[:]
        x_validate_fold = temp_X.pop(fold)
        y_validate_fold = temp_y.pop(fold)

        temp_X = np.array([y for x in temp_X for y in x])
        temp_y = np.array([y for x in temp_y for y in x])
        classifier.train(temp_X, temp_y)

        y_test_pred = classifier.predict(x_validate_fold, k=k)
        num_correct = np.sum(y_test_pred == y_validate_fold)
        # divide by the fold size (not num_test): the validation fold has
        # len(y_validate_fold) samples
        accuracy = float(num_correct) / len(y_validate_fold)