Example #1
def test_arange(self):
    train = np.arange(150).reshape(5, -1)
    test = np.square(np.arange(2, 122)).reshape(4, -1)
    knn = KNearestNeighbor()
    knn.train(train, None)
    d_two = knn.compute_distances_two_loops(test)
    d_one = knn.compute_distances_one_loop(test)
    d_no = knn.compute_distances_no_loops(test)
    self.assertAlmostEqual(0, np.linalg.norm(d_two - d_one, ord='fro'))
    self.assertAlmostEqual(0, np.linalg.norm(d_no - d_one, ord='fro'))
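The three distance methods compared here should agree up to floating-point error. For reference, a minimal sketch of the fully vectorized variant, assuming the cs231n-style API where train simply stores the data (the function name is illustrative):

import numpy as np

def compute_distances_no_loops_sketch(X_train, X_test):
    # ||a - b||^2 = ||a||^2 - 2*a.b + ||b||^2, evaluated for all pairs at once
    test_sq = np.sum(X_test ** 2, axis=1, keepdims=True)  # (num_test, 1)
    train_sq = np.sum(X_train ** 2, axis=1)               # (num_train,)
    cross = X_test.dot(X_train.T)                         # (num_test, num_train)
    sq = np.maximum(test_sq - 2 * cross + train_sq, 0)    # clamp tiny negatives
    return np.sqrt(sq)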
Example #2
def Cross_validation(X_train, y_train):
    """交叉验证,确定超参K,同时可视化K值

    :param X_train: 训练集
    :param y_train: 训练标签
    """
    num_folds = 5
    k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]
    k_accuracy = {}
    # Split the dataset into 5 folds
    X_train_folds = np.array_split(X_train, num_folds)
    y_train_folds = np.array_split(y_train, num_folds)
    # Evaluate each candidate value of k
    for k in k_choices:
        k_accuracy[k] = []
        # For each k, compute the accuracy with each fold held out as the validation set
        for index in range(num_folds):
            # Build the training/validation split for this fold
            X_te = X_train_folds[index]
            y_te = y_train_folds[index]
            X_tr = np.reshape(
                X_train_folds[:index] + X_train_folds[index + 1:],
                (X_train.shape[0] * (num_folds - 1) // num_folds, -1))
            y_tr = np.reshape(
                y_train_folds[:index] + y_train_folds[index + 1:],
                (X_train.shape[0] * (num_folds - 1) // num_folds,))
            # Predict on the held-out fold
            classify = KNearestNeighbor()
            classify.train(X_tr, y_tr)
            y_te_pred = classify.predict(X_te, k=k)
            accuracy = np.sum(y_te_pred == y_te) / float(X_te.shape[0])
            k_accuracy[k].append(accuracy)

    for k, accuracylist in k_accuracy.items():
        for accuracy in accuracylist:
            print("k = %d, accuracy = %.3f" % (k, accuracy))

    # Visualize the accuracy for each k
    for k in k_choices:
        accuracies = k_accuracy[k]
        plt.scatter([k] * len(accuracies), accuracies)
    accuracies_mean = np.array(
        [np.mean(v) for k, v in sorted(k_accuracy.items())])
    accuracies_std = np.array(
        [np.std(v) for k, v in sorted(k_accuracy.items())])
    # Build an error-bar plot from the per-k means and standard deviations
    plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)
    plt.title('Cross-validation on k')
    plt.xlabel('k')
    plt.ylabel('Cross-validation accuracy')
    plt.show()
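After the error-bar plot, one would typically keep the k with the highest mean cross-validation accuracy. A small helper along these lines could do it; the name best_k is illustrative, not part of the original code:

import numpy as np

def best_k(k_accuracy):
    # Pick the k whose mean cross-validation accuracy is highest
    return max(k_accuracy, key=lambda k: np.mean(k_accuracy[k]))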
Example #3
import pickle

import numpy as np

if __name__ == '__main__':
    train_path = "/Users/zxj/Desktop/Mini1/train.pkl"
    train_data = pickle.load(open(train_path, "rb"))

    # Fixed_parameters
    # Please do not change the fixed parameters

    val_ratio = 0.2

    # student_parameters
    # You may want to change these in your experiment later.
train_ratio = 1.0  # fraction of train_data to use; it is then split 0.8 : 0.2 into training and validation

    train_num = int(train_data['data'].shape[0] * train_ratio *
                    (1.0 - val_ratio))
    val_num = -1 * int(train_data['data'].shape[0] * train_ratio * val_ratio)
    KNN_classifier = KNearestNeighbor()
    KNN_classifier.train(train_data['data'][:train_num],
                         train_data['target'][:train_num])
    dists = KNN_classifier.compute_distances(train_data['data'][val_num:, :])
    k_choices = [2, 3, 5, 7, 9, 11, 15, 19]
    for k in k_choices:
        y_test_pred = KNN_classifier.predict_labels(dists, k)

        num_correct = np.sum(y_test_pred == train_data['target'][val_num:])
        accuracy = float(num_correct) / (-1 * val_num)
        print(
            'For K= %d and train_ratio= %f, Got %d / %d correct => VAL_accuracy: %f'
            % (k, train_ratio, num_correct, -1 * val_num, accuracy))
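predict_labels is presumably a majority vote over the k nearest training labels. A minimal sketch of that behaviour, assuming non-negative integer labels (this is not the actual class method):

import numpy as np

def predict_labels_sketch(dists, y_train, k):
    # dists: (num_test, num_train) distance matrix; y_train: integer labels
    y_pred = np.zeros(dists.shape[0], dtype=np.int64)
    for i in range(dists.shape[0]):
        closest_y = y_train[np.argsort(dists[i])[:k]]  # labels of the k nearest neighbors
        y_pred[i] = np.bincount(closest_y).argmax()    # majority vote; ties favor the smaller label
    return y_pred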
Example #4
def run_model_with_cross_validation(model_name, knn_mode, k_number):

	# GET DATA
	# - expect data_0 ... data_4
	data_groups = [np.loadtxt('data_%d' % i, delimiter=',') for i in range(5)]

	NUM_GROUPS = len(data_groups)

	# For each data group, train on all the others and test on this one
	culminating_result = 0

	for test_group_id in range(NUM_GROUPS):

		# Form the training data from the other 4/5 of the data
		train_data = np.array([])
		for train_group_id in range(len(data_groups)):
			if (train_group_id != test_group_id):
				#Initialize train_data if necessary
				if (train_data.size == 0):
					train_data = np.copy(data_groups[train_group_id])
				else:
					train_data = np.concatenate(
							(train_data, data_groups[train_group_id]), axis=0)

		print('train_data, group ', str(test_group_id), 'length: ', len(train_data))
		print(train_data)

		test_data = data_groups[test_group_id]

		result = 0
		model = None
		if (model_name == 'knn'):
			model = KNearestNeighbor(train_data, k_number)
			model.train(train_data)
			print('KNN train data length', len(model.data))
			result = model.test(test_data, knn_mode)
		elif (model_name == 'c_knn'):
			model = CondensedKNearestNeighbor(train_data, k_number)
			#Mode is always majority...this is not used for regression
			mode = "majority"
			model.train(train_data)
			print('condensed KNN train data length', len(model.data))
			result = model.test(test_data, mode)
		else:
			print('error - ', model_name, ' is not a supported model')
			return

		print('test_data, group ', str(test_group_id), 'length:', len(test_data))
		print(test_data)

		print()
		print('result of iteration ' + str(test_group_id))
		print(result)
		print()

		culminating_result = culminating_result + result
	
	final_average_result = culminating_result / NUM_GROUPS
	print()
	print('final average result:')
	print(final_average_result)
	print()

	return final_average_result
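A call might then look like the following; the 'knn' and 'c_knn' model names come from the function itself, while the mode string and k value are illustrative:

# Hypothetical invocation; the knn_mode and k_number values are assumptions
run_model_with_cross_validation('knn', 'majority', 3)
run_model_with_cross_validation('c_knn', 'majority', 3)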
Example #5
    data_instances = []
    data_file = open(test[0])
    print("Running with %s" % test[0])
    for line in data_file:
        line_split = line.split(',')
        data_instances.append(list(map(float, line_split)))
    data_instances = np.array(data_instances)
    np.random.shuffle(data_instances)

    # 5 fold cross validation
    learner_type = "CLASSIFICATION"
    fold_size = data_instances.shape[0] // 5
    data_indices = list(range(data_instances.shape[0]))
    for k in range(1, 100, 5):
        total_performance = 0.0
        for holdout_fold_idx in range(5):
            kNN_model = KNearestNeighbor(k, learner_type)
            kNN_model.train(data_instances[
                np.setdiff1d(data_indices, data_indices[
                    fold_size * holdout_fold_idx:
                    fold_size * holdout_fold_idx + fold_size])])
            kNN_model.condense_training_data()
            #  predict test data using k-NN and average performance
            predictions = kNN_model.predict(
                data_instances[
                    fold_size * holdout_fold_idx:
                    fold_size * holdout_fold_idx + fold_size])
            successes = fold_size - \
                sum(abs(
                   predictions - \
                   data_instances[
Example #6
print('Test data new shape: ', X_test.shape)

# Create a kNN classifier instance.
# Remember that training a kNN classifier is a no-op:
# the classifier simply remembers the data and does no further processing
# classifier = KNearestNeighbor()
# classifier.train(X_train, y_train)

# numK = [8,9,10,11,12,13,14,15,16]
numK = [12]
results = {}
bestValAcc = 0
bestK = None

for num in numK:
    knn = KNearestNeighbor()
    knn.train(X_train, y_train)
    y_train_pred = knn.predict(X_train, k=num)
    y_val_pred = knn.predict(X_val, k=num)
    trainAcc = np.mean(y_train == y_train_pred)
    valAcc = np.mean(y_val == y_val_pred)
    print('k: %d train accuracy: %.4f val accuracy: %.4f' % (num, trainAcc,
                                                             valAcc))
    if valAcc > bestValAcc:
        bestValAcc = valAcc
        bestK = num

print('best validation accuracy achieved: %.4f, with best k: %d' % (
    bestValAcc, bestK))

# Based on the cross-validation results above, choose the best value for k,
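As the comment near the top of this example notes, training a kNN classifier is a no-op beyond memorizing the data. A minimal sketch of that idea (not the actual cs231n class):

class MinimalKNN(object):
    def train(self, X, y):
        # "Training" only stores the data; all work happens at prediction time
        self.X_train = X
        self.y_train = y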
Example #7
mask = list(range(num_training))
X_train = X_train[mask]
y_train = y_train[mask]
num_test = 500
mask = list(range(num_test))
X_test = X_test[mask]
y_test = y_test[mask]

# Reshape the image data into rows
X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))
print(X_train.shape, X_test.shape)
# Create a kNN classifier instance, k=1


classifier = KNearestNeighbor()
classifier.train(X_train, y_train)
"""
dists = classifier.compute_distances_two_loops(X_test)
print('dists.shape is')
print(dists.shape)
#plt.imshow(dists,interpolation='none')
#plt.savefig('/home/hongyin/file/cs231n-assignment1/picFaster.jpg')
y_test_pred=classifier.predict_labels(dists,k=1)
num_correct=np.sum(y_test_pred==y_test)
accuracy=float(num_correct)/num_test
print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))


#k=5
y_test_pred = classifier.predict_labels(dists, k=5)
Example #8
# Visualize the images
VisualizeImage(X_train, y_train)
input('Press any key to start cross-validation...')

# Build a cross-validation set for hyperparameter tuning (a plain validation set would also work, since there is plenty of data)
num_training = 5000
X_tr = X_train[:num_training, ::]
X_tr = np.reshape(X_tr, (X_tr.shape[0], -1))
y_tr = y_train[:num_training]
# print(X_tr.shape, y_tr.shape)

num_testing = 500
X_te = X_test[:num_testing, ::]
X_te = np.reshape(X_te, (X_te.shape[0], -1))
y_te = y_test[:num_testing]
# print(X_te.shape, y_te.shape)

# Cross-validate to choose the hyperparameter k
Cross_validation(X_tr, y_tr)
input('Press any key to train the model...')

# Train on the full dataset (here 5000 samples serve as the full training set and 500 as the test set, since all 60000 would exhaust memory; the plot suggests k = 10 is best)
classify = KNearestNeighbor()
classify.train(X_tr, y_tr)
y_te_pred = classify.predict(X_te, k=10)
accuracy = np.sum(y_te_pred == y_te) / float(X_te.shape[0])
print('Final test: K = %d, accuracy = %.3f' % (10, accuracy))
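VisualizeImage is not defined in this snippet; a sketch of what such a helper might plausibly do, assuming X still holds unflattened images (every name below is an assumption):

import matplotlib.pyplot as plt
import numpy as np

def visualize_image(X, y, samples_per_class=5):
    # Show a small grid with a few example images per class
    classes = np.unique(y)
    for col, cls in enumerate(classes):
        idxs = np.flatnonzero(y == cls)[:samples_per_class]
        for row, idx in enumerate(idxs):
            plt.subplot(samples_per_class, len(classes),
                        row * len(classes) + col + 1)
            plt.imshow(X[idx].astype('uint8'), cmap='gray')
            plt.axis('off')
    plt.show()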


Example #9
                                fold_size * holdout_fold_idx + fold_size]))])
            centroids = kmeans_model.get_centroids()
            for cluster_idx in range(len(clusters)):
                ave_label = 0.0
                for instance in clusters[cluster_idx]:
                    ave_label += instance[-1]
                if len(clusters[cluster_idx]) > 0:
                    ave_label = ave_label / len(clusters[cluster_idx])
                if learner_type == "CLASSIFICATION":
                    ave_label = int(round(ave_label))
                centroids[cluster_idx].append(ave_label)

            #     for classification, vote to determine centroid classification
            #     for regression, average to find centroid estimate
            #  feed centroids into k-NN as training data
            kNN_model = KNearestNeighbor(best_ks[test[0]], learner_type)
            kNN_model.train(centroids)
            #  predict test data using k-NN and average performance
            predictions = kNN_model.predict(
                data_instances[
                    fold_size * holdout_fold_idx:
                    fold_size * holdout_fold_idx + fold_size])
            if kNN_model.learner_type == "CLASSIFICATION":
                successes = fold_size - \
                    sum(abs(
                        predictions - \
                        data_instances[
                            fold_size * holdout_fold_idx :
                            fold_size * holdout_fold_idx + fold_size,-1]))
                performance = successes / fold_size
            elif kNN_model.learner_type == "REGRESSION":
Example #10
        file_name = train_files[i]
        file_str = file_name.split('.')[0]
        class_num = int(file_str.split('_')[0])
        digits_labels.append(class_num)
        training_mat[i, :] = img2vector('../../data/digits/trainingDigits/%s' %
                                        file_name)
    # Train on the training set
    classifier.fit(training_mat, digits_labels)

    test_files = os.listdir('../../data/digits/testDigits')
    err_cnt = 0
    m_test = len(test_files)
    for i in range(m_test):
        file_name = test_files[i]
        file_str = file_name.split('.')[0]
        class_num = int(file_str.split('_')[0])
        vec_under_test = img2vector('../../data/digits/testDigits/%s' %
                                    file_name)
        classifier_result = classifier.predict(vec_under_test)
        print('the classifier came back with: %d, the real answer is: %d' %
              (classifier_result, class_num))
        if classifier_result != class_num:
            err_cnt += 1
    print('the total number of errors is: %d' % err_cnt)
    print('the total error rate is: %f' % (err_cnt / float(m_test)))


if __name__ == '__main__':
    knn = KNearestNeighbor(k=10)
    digits_classify(classifier=knn)
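img2vector is not shown here; it presumably flattens a 32x32 text-encoded digit (the classic handwritten-digits format, one character per pixel) into a 1x1024 vector. A sketch under that assumption:

import numpy as np

def img2vector(filename):
    # Flatten a 32x32 text digit (lines of '0'/'1' characters) into a 1x1024 vector
    vec = np.zeros((1, 1024))
    with open(filename) as f:
        for i in range(32):
            line = f.readline()
            for j in range(32):
                vec[0, 32 * i + j] = int(line[j])
    return vec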
Example #11
# subsample train data. array indexing
X_train = X_train[mask]
y_train = y_train[mask]

num_test = 500
mask = list(range(num_test))
# subsample test data. array indexing
X_test = X_test[mask]
y_test = y_test[mask]

# reshape data to rows
X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))

# initialize the classifier object
knn_classifier = KNearestNeighbor()
# train with training set
# knn_classifier.train(X_train, y_train)
"""
# compute distances for test set
"""
# dists = knn_classifier.compute_distances_two_loops(X_test)
# dists_one_loop = knn_classifier.compute_distances_one_loop(X_test)
# dists_no_loop = knn_classifier.compute_distances_no_loops(X_test)
"""
# test one loop computation
"""
# difference = np.linalg.norm(dists - dists_one_loop, ord='fro')
# print('One loop difference was: %f' % (difference,))
"""
test no loop computation