def test_knn_classify_basic(self):
    """Check knn.classify on a six-row, three-label toy data set.

    Each query point lies next to one pair of identically labelled
    training rows; 3 is passed as the last argument of knn.classify.
    """
    factors = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1],
                        [-1, -1], [-0.5, -1.1]])
    labels = ['A', 'A', 'B', 'B', 'C', 'C']
    cases = (
        ([1.1, 0.9], 'A'),
        ([-0.1, 0.3], 'B'),
        ([-0.7, -1.3], 'C'),
    )
    for query, expected in cases:
        # assertEqual replaces the failUnless alias (removed from unittest
        # in Python 3.12) and reports both values when the check fails.
        self.assertEqual(knn.classify(query, factors, labels, 3), expected)
def handwriteingClassTest():
    """Classify every file in 'testDigits' and print the error statistics.

    The training matrix has one 1024-wide row per file in 'trainingDigits'.
    A file's label is the integer found before the first underscore of its
    name (after the extension has been cut off).
    """
    trainingNames = listdir('trainingDigits')
    trainingMat = zeros((len(trainingNames), 1024))
    hwLabels = []
    for row, name in enumerate(trainingNames):
        stem = name.split('.')[0]
        hwLabels.append(int(stem.split('_')[0]))
        trainingMat[row, :] = img2vector('trainingDigits/%s' % name)

    testNames = listdir('testDigits')
    mTest = len(testNames)
    errorCount = 0.0
    for name in testNames:
        stem = name.split('.')[0]
        expected = int(stem.split('_')[0])
        vectorUnderTest = img2vector('testDigits/%s' % name)
        classifierResult = classify(vectorUnderTest, trainingMat, hwLabels, 3)
        print("the classifier came back with: %d, the real answer is : %d" % (classifierResult, expected))
        if classifierResult != expected:
            errorCount += 1.0

    print("The total number of errors is: %d." % errorCount)
    print("The total error rate is: %f." % (errorCount/float(mTest)))
def main(dataset_name, testset_name, new_emails=False):
    '''Runs the knn classifier for a training set dataset_name and test set testset_name.

    Both names are folders located next to this file. A "results" folder is
    created inside the test-set folder and its path is handed to
    knn.classify. When new_emails is false, the accuracy for every k in
    klist is printed and plotted; when it is true only the predictions are
    produced.
    '''
    # os.path.join with a trailing '' keeps the trailing separator that the
    # helpers receive, but uses the platform separator instead of a
    # hard-coded backslash (which only worked on Windows).
    current_path = os.path.dirname(os.path.abspath(__file__))
    trainingset_path = os.path.join(current_path, dataset_name, '')
    testset_path = os.path.join(current_path, testset_name, '')
    results_path = os.path.join(testset_path, 'results', '')
    if not os.path.exists(results_path):
        os.mkdir(results_path)

    # Sub-folders of the test set, without the results folder created above.
    folder_names = next(os.walk(testset_path + "."))[1]
    if 'results' in folder_names:
        folder_names.remove('results')
    if new_emails:
        # New mails are read straight from the test-set folder itself.
        folder_names = [""]

    workfilename = 'mergedworkfile.csv'
    wordfilename = 'wordfile.csv'
    # A wider sweep used previously: [1, 3, 7, 15, 24, 33, 42, 50]
    klist = [1, 3]
    acc = []
    ks = []
    trainingSet = []  # filled in place by ex.loadTrainingset
    kind = 'New' if new_emails else 'Test'

    print("Loading Training Set...")
    wordsd, subd, digramsd, trigramsd = ex.loadTrainingset(
        trainingset_path, workfilename, wordfilename, trainingSet)
    print("Training Set loaded.")

    print('Collecting ' + kind + ' Emails...')
    testSet, all_files = ex.loadTestset(testset_path, folder_names, wordsd,
                                        subd, digramsd, trigramsd)
    print(kind + ' Emails Collected.')

    # Training and test rows must have the same width.
    assert (len(trainingSet[0]) == len(testSet[0]))
    list_of_predictions = knn.classify(klist, trainingSet, testSet,
                                       results_path)

    if not new_emails:
        # Accuracy is only computed for the regular test set: column i of
        # list_of_predictions holds the predictions for klist[i].
        for i, k in enumerate(klist):
            predictions = [list_of_predictions[x][i]
                           for x in range(len(testSet))]
            accuracy = knn.getAccuracy(testSet, predictions)
            acc.append(accuracy)
            ks.append(k)
            print('K: ' + repr(k))
            print('Accuracy: ' + repr(accuracy) + '%')
        print('Overall Accuracy: ' + str(sum(acc) / len(acc)) + "%")
        plt.plot(ks, acc)
        plt.xlabel('K')
        plt.ylabel('Accuracy')
        plt.show()

    print('Find the results at: ' + results_path)
def test_classify(self):
    """knn.classify should answer "Y" for a point close to the 'Y' samples."""
    # Labelled samples: the first three carry 'X', the last three 'Y'.
    samples = [(1, 0), (1, 1), (-1, -2), (120, 100), (100, 100), (120, 130)]
    sample_labels = ['X', 'X', 'X', 'Y', 'Y', 'Y']
    unknown = (100, 110)  # sample whose label is unknown
    assert knn.classify(unknown, samples, sample_labels, 3) == "Y"
def classify_test_data(filename):
    """Classify a single file from the test set.

    The file is looked up under test_digits_path. Returns the classify()
    result converted to int. If FileNotFoundError is raised, "No such file."
    is printed and None is returned.
    """
    target = test_digits_path + filename
    try:
        vector = img2vector(target)
        return int(classify(vector, train_dataset, train_labels, 3))
    except FileNotFoundError:
        print("No such file.")
        return None
def classify_person(percent_game, fly_miles, ice_cream):
    """Return a textual rating for one person described by three features.

    The features are normalised with the ranges and minimums obtained from
    "dating.dataset" before being handed to knn.classify; the result is used
    as a 1-based index into the rating texts.
    """
    ratings = ['not at all', 'in small doses', 'in large doses']
    raw_point = array([percent_game, fly_miles, ice_cream])
    data_set, labels = get_data_set_from_file("dating.dataset")
    data_set, value_ranges, min_values = knn.auto_normalize(data_set)
    scaled_point = knn.normalize(raw_point, value_ranges, min_values)
    label = knn.classify(scaled_point, data_set, labels, 3)
    return ratings[label - 1]
def main(dataset_name, testset_name, new_emails=False):
    '''Runs the knn classifier for a training set dataset_name and test set testset_name.

    Both names are folders located next to this file. A "results" folder is
    created inside the test-set folder and its path is handed to
    knn.classify. When new_emails is false, the accuracy for every k in
    klist is printed and plotted; when it is true only the predictions are
    produced.
    '''
    # os.path.join with a trailing '' keeps the trailing separator that the
    # helpers receive, but uses the platform separator instead of a
    # hard-coded backslash (which only worked on Windows).
    current_path = os.path.dirname(os.path.abspath(__file__))
    trainingset_path = os.path.join(current_path, dataset_name, '')
    testset_path = os.path.join(current_path, testset_name, '')
    results_path = os.path.join(testset_path, 'results', '')
    if not os.path.exists(results_path):
        os.mkdir(results_path)

    # Sub-folders of the test set, without the results folder created above.
    folder_names = next(os.walk(testset_path + "."))[1]
    if 'results' in folder_names:
        folder_names.remove('results')
    if new_emails:
        # New mails are read straight from the test-set folder itself.
        folder_names = [""]

    workfilename = 'mergedworkfile.csv'
    wordfilename = 'wordfile.csv'
    # A wider sweep used previously: [1, 3, 7, 15, 24, 33, 42, 50]
    klist = [1, 3]
    acc = []
    ks = []
    trainingSet = []  # filled in place by ex.loadTrainingset
    kind = 'New' if new_emails else 'Test'

    print("Loading Training Set...")
    wordsd, subd, digramsd, trigramsd = ex.loadTrainingset(
        trainingset_path, workfilename, wordfilename, trainingSet)
    print("Training Set loaded.")

    print('Collecting ' + kind + ' Emails...')
    testSet, all_files = ex.loadTestset(testset_path, folder_names, wordsd,
                                        subd, digramsd, trigramsd)
    print(kind + ' Emails Collected.')

    # Training and test rows must have the same width.
    assert (len(trainingSet[0]) == len(testSet[0]))
    list_of_predictions = knn.classify(klist, trainingSet, testSet,
                                       results_path)

    if not new_emails:
        # Accuracy is only computed for the regular test set: column i of
        # list_of_predictions holds the predictions for klist[i].
        for i, k in enumerate(klist):
            predictions = [list_of_predictions[x][i]
                           for x in range(len(testSet))]
            accuracy = knn.getAccuracy(testSet, predictions)
            acc.append(accuracy)
            ks.append(k)
            print('K: ' + repr(k))
            print('Accuracy: ' + repr(accuracy) + '%')
        print('Overall Accuracy: ' + str(sum(acc) / len(acc)) + "%")
        plt.plot(ks, acc)
        plt.xlabel('K')
        plt.ylabel('Accuracy')
        plt.show()

    print('Find the results at: ' + results_path)
def classify(p):
    """Classify the pickled test table once for each k in 3, 5, 7, 9.

    Reads the module-level `seg` to build file names and appends the elapsed
    time of every k to the module-level `times`. For each k the annotated
    test table and the ROC table are pickled under 'p_files/'.
    """
    # Feature vectors built from the image table.
    training_data = knn.construct(p)
    table_stem = 'p_files/test_table_' + seg + '_pc' + str(p)
    roc_stem = 'p_files/ROC_table_' + seg + '_pc' + str(p)

    # NOTE(review): pickle.load is only safe on files this project wrote.
    with open(table_stem + '.p', 'rb') as f:
        test_table = pickle.load(f)
    max_count = len(test_table)

    for k in [3, 5, 7, 9]:
        start_time = time.time()
        print('seg: ' + seg + ' - k : ' + str(k))
        correct = 0
        ROC = {}
        for im_struct in test_table:
            # knn.classify's result is used as a list whose first entry
            # starts with the winning class id.
            label_candidates = knn.classify(training_data, im_struct['feature_vector'], k)
            label = str(int(label_candidates[0][0]))
            im_struct['label_candidates'] = label_candidates
            im_struct['prediction'] = True
            class_id = im_struct['class_id']
            print(label, class_id)
            if class_id == label:
                correct += 1
                ROC = update_ROC(ROC, class_id, tp=True, number=True)
                continue
            # Miss: count the sample for its true class and a false
            # positive for the predicted one.
            ROC = update_ROC(ROC, class_id, number=True)
            ROC = update_ROC(ROC, label, fp=True)
            im_struct['prediction'] = False

        print('accuracy: ', correct/ max_count)
        print('correct: ', correct)
        suffix = '_k' + str(k) + '.p'
        with open(table_stem + suffix, 'wb') as f:
            pickle.dump(test_table, f)
        with open(roc_stem + suffix, 'wb') as f:
            pickle.dump(ROC, f)
        times.append(time.time() - start_time)
def flores_data_set_run():
    """Classify the held-out rows of './data/flores.arff' and print the accuracy.

    list_helper.separate_list splits the rows into a test and a training
    part. The last column of every row is used as its real label and is
    cut off before the row is given to knn.classify.
    """
    # Close the file deterministically instead of leaking the handle.
    with open('./data/flores.arff') as arff_file:
        data_set = arff.load(arff_file)
    test_set, train_set = list_helper.separate_list(data_set['data'], 10)

    results = []
    for row in test_set:
        result = knn.classify(train_set, row[:-1])
        results.append(result)
        # Single-argument print() behaves the same on Python 2 and 3.
        print("Real: " + row[-1] + " - Predicted: " + result)

    accuracy = knn.calculate_accuracy(test_set, results)
    print("Accuracy: " + str(accuracy) + "%")
def remove_noise(points, classes, metric, kernel, k):
    """Keep only the points that knn.classify labels correctly.

    folds() is asked for len(points) folds; in each fold the first test
    point is classified from the fold's training part. Points whose
    prediction differs from the fold's first test class are dropped.

    Returns a pair of numpy arrays: the kept points and their classes.
    """
    kept_points, kept_classes = [], []
    for fold in folds(points, classes, len(points)):
        predicted = knn.classify(fold["train_p"], fold["train_c"],
                                 fold["test_p"][0], metric, kernel, k)
        if predicted == fold["test_c"][0]:
            # Prediction matches the known class, so this is not noise.
            kept_points.append(fold["test_p"][0])
            kept_classes.append(predicted)
    return np.array(kept_points), np.array(kept_classes)
def test_knn_classify_digits(self):
    """The error ratio on 'digits/testDigits' must stay below 0.85."""
    test_xs, test_ys = data.create_digit_matrix('digits/testDigits')
    train_xs, train_ys = data.create_digit_matrix('digits/trainingDigits')
    errors = 0
    for index, test_row in enumerate(test_xs):
        predicted_y = knn.classify(test_row, train_xs, train_ys, 3)
        if predicted_y != test_ys[index]:
            errors += 1
    print(errors)
    print(test_xs.shape[0])
    # float() keeps the ratio a true division on Python 2 as well, and
    # assertLess replaces failUnless, which was removed in Python 3.12.
    self.assertLess(errors / float(test_xs.shape[0]), 0.85)
def datingClassTest():
    """Hold out the first 10% of 'datingTestSet2.txt' and print the error rate.

    The first numTestVecs rows are classified one by one; all remaining
    rows and their labels are passed to knn.classify together with k.
    """
    ratio = 0.1
    k = 3
    features, labels = preparation.file2matrix('datingTestSet2.txt')
    features = utils.normalize(features)
    rowCount = features.shape[0]
    numTestVecs = int(ratio * rowCount)
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = knn.classify(features[i, :],
                                        features[numTestVecs:rowCount, :],
                                        labels[numTestVecs:rowCount], k)
        # print() with one pre-formatted string works on Python 2 and 3;
        # the former print statement was a SyntaxError on Python 3.
        print("feature: %s | yhat: %d | y: %d" % (features[i, :], classifierResult, labels[i]))
        if classifierResult != labels[i]:
            errorCount += 1
    print("Error rate = %f" % (errorCount / numTestVecs))
def test():
    ''' encode the labels, classify one hard-coded sample and print its decoded label '''
    dataset, labels = load_dataset()
    encoder = preprocessing.LabelEncoder()
    encoded_labels = encoder.fit_transform(labels)
    query = [[1000, 0.5, 340]]
    n_neighbor = classify(dataset, query, encoded_labels)
    # Map the encoded prediction back to the original label text.
    print(encoder.inverse_transform(n_neighbor))
def classify_test_dataset(k):
    """Classify the whole test set and print total, error count and error rate.

    Reads the module-level test_dataset/test_labels and
    train_dataset/train_labels; k is forwarded to classify().
    """
    test_num = len(test_dataset)  # number of test samples
    error_num = 0  # number of wrong predictions
    for sample, expected in zip(test_dataset, test_labels):
        predicted = classify(sample, train_dataset, train_labels, k)
        error_num += 1 if predicted != expected else 0
    print("total:{},error num:{},error rate:{}".format(test_num, error_num, error_num / test_num))
def prediccion_data_set_run():
    """Classify the held-out rows of './data/prediccion.arff' and print the accuracy.

    Every row is first converted with list_helper.unicode_to_int. The last
    column of a row is used as its real value and is cut off before the row
    is given to knn.classify.
    """
    # Close the file deterministically instead of leaking the handle.
    with open('./data/prediccion.arff') as arff_file:
        data_set = arff.load(arff_file)
    cleaned_data_set = [list_helper.unicode_to_int(x) for x in data_set['data']]
    test_set, train_set = list_helper.separate_list(cleaned_data_set, 1)

    results = []
    for row in test_set:
        result = knn.classify(train_set, row[:-1])
        results.append(result)
        # Single-argument print() behaves the same on Python 2 and 3.
        print("Real: " + str(row[-1]) + " - Predicted: " + str(result))

    accuracy = knn.calculate_accuracy(test_set, results)
    print("Accuracy: " + str(accuracy) + "%")
def classify(self, sudoku):
    """Combine a knn level and an SGL prediction into one level for sudoku.

    Returns 1 as soon as either of the two equals 1, otherwise the weighted
    value (2 * knn level + 3 * SGL level) / 5.
    """
    features = self.feature.feature_vector(sudoku)
    # The leading entry is discarded here and replaced by the knn level below.
    features.pop(0)
    neighbour = knn.classify({'features': features}, "train_results.txt", 5)
    level = int(neighbour['level'])

    learner = learning.StochasticGradientLearner(learning.basicFeatureExtractor)
    features.insert(0, level)
    sgl_level = learner.predict_one(features)

    weighted_level = (2*level + 3*sgl_level) / 5
    if level == 1 or sgl_level == 1:
        return 1
    return weighted_level
def classify_gui(k):
    """
    Ask for one person's three feature values and classify that person.

    :param k: k value forwarded to classify()
    :return: the category text selected by the classification result
    """
    data_mat, class_label_vector = file_to_matrix('../data/dating_test_set_2.txt')
    fly_distances = float(input("请输入飞行里程数:"))
    icecream = float(input("请输入消耗冰淇淋公升数:"))
    play_time = float(input("请输入玩游戏花费时间百分比:"))
    norm_data_set, ranges, min_vals = auto_norm(data_mat)

    # NOTE(review): the vector is built as [miles, ice cream, game time];
    # confirm that this matches the column order of the data file.
    person = np.array([fly_distances, icecream, play_time])
    scaled_person = (person - min_vals) / ranges

    categories = ['不喜欢', '一般', '极具魅力']
    # The classification result is used as a 1-based index.
    label = classify(scaled_person, norm_data_set, class_label_vector, k)
    return categories[label - 1]
def datingClassTest():
    """Hold out the first 10% of 'datingTestSet2.txt' and print the error rate.

    The first numTestVecs rows are classified one by one; all remaining
    rows and their labels are passed to knn.classify together with k.
    """
    ratio = 0.1
    k = 3
    features, labels = preparation.file2matrix('datingTestSet2.txt')
    features = utils.normalize(features)
    rowCount = features.shape[0]
    numTestVecs = int(ratio * rowCount)
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = knn.classify(features[i, :],
                                        features[numTestVecs:rowCount, :],
                                        labels[numTestVecs:rowCount], k)
        # print() with one pre-formatted string works on Python 2 and 3;
        # the former print statement was a SyntaxError on Python 3.
        print("feature: %s | yhat: %d | y: %d" % (features[i, :], classifierResult, labels[i]))
        if classifierResult != labels[i]:
            errorCount += 1
    print("Error rate = %f" % (errorCount / numTestVecs))
def classify_person():
    """
    Ask for one person's three feature values and classify that person.

    :return: the raw result of classify() (called with 3 as last argument)
    """
    ff_miles = float(input("每年飞行常客里程数:"))
    ice_cream = float(input('每周消耗的冰淇淋公升数:'))
    percent_game = float(input('玩游戏所消耗的时间百分比:'))

    data_mat, class_label_vector = file_to_matrix('../data/dating_test_set_2.txt')
    norm_dating_data_set, ranges, min_vals = auto_norm(data_mat)

    # NOTE(review): the vector is built as [miles, ice cream, game time];
    # confirm that this matches the column order of the data file.
    candidate = np.array([ff_miles, ice_cream, percent_game])
    scaled_candidate = (candidate - min_vals) / (ranges)
    return classify(scaled_candidate, norm_dating_data_set, class_label_vector, 3)
def test_knn_classify_dating(self):
    """The error ratio on the first 10% of the dating set must stay below 0.85.

    The first test_n rows are classified against the remaining rows, with 5
    passed as the last argument of knn.classify.
    """
    xs, ys = data.create_dating_set()
    xs, mins, ranges = knn.normalize_cols(xs)
    n = xs.shape[0]
    test_n = int(n * 0.1)
    errors = 0
    for i in range(test_n):
        predicted_y = knn.classify(xs[i], xs[test_n:n, :], ys[test_n:n], 5)
        if predicted_y != ys[i]:
            errors += 1
    print(errors)
    print(test_n)
    # assertLess replaces failUnless, which was removed in Python 3.12,
    # and shows both numbers when the check fails.
    self.assertLess(errors / float(test_n), 0.85)
def dating_class_test():
    """
    Measure the classifier's error rate on a held-out part of the data.

    The first 10% of the normalised rows are classified against the rest,
    with 5 passed as the last argument of classify().
    :return: None
    """
    hold_out_ratio = 0.10  # share of the data used as test rows
    data_set, labels = file_to_matrix('./data/dating_test_set_2.txt')
    norm_data_set, ranges, min_vals = auto_norm(data_set)
    size = norm_data_set.shape[0]  # number of rows
    num_test_size = int(size * hold_out_ratio)  # number of held-out rows

    error_count = 0.0
    for row in range(num_test_size):
        classifier_result = classify(norm_data_set[row, :],
                                     norm_data_set[num_test_size:size],
                                     labels[num_test_size:size], 5)
        expected = labels[row]
        print('分类器返回:%d, 真实答案为:%d'% (classifier_result, expected))
        if classifier_result != expected:
            error_count += 1.0
    print('分类器错误率为:%0.2f%%' % (error_count/(float(num_test_size))*100))
def run():
    """Run 10 x 10 rounds of the Bayesian, KNN and sum-rule classifiers.

    For each of 10 data loads, 10 folds are evaluated. Results are appended
    to three 'part2-results-*.txt' files and the error rates are collected
    per classifier name, then handed to compare().
    """
    classificadores = ["Bayesian", "KNN 1", "KNN 5", "KNN 10", "KNN 20",
                       "KNN 30", "Sum 1", "Sum 5", "Sum 10", "Sum 20",
                       "Sum 30"]
    errorResults = {c: [] for c in classificadores}
    K = [30, 20, 10, 5, 1]

    # The with-statement closes all three files even if a round raises;
    # before, an exception left them open.
    with open('part2-results-bayesian.txt', 'a') as resultsBay, \
            open('part2-results-knn.txt', 'a') as resultsKn, \
            open('part2-results-sum.txt', 'a') as resultsSum:
        for j in range(10):
            H = data_proccessing.loadData()
            # Folds
            for i in range(10):
                round_header = "Round %d\n\n" % (j*10+i+1)
                resultsBay.write(round_header)
                resultsKn.write(round_header)
                resultsSum.write(round_header)

                (trainX, trainY, testX, testY) = prepareSets(H, i)

                (P_bay, E_bay, e_rate_bay, se_bay, interval_bay) = bayesian.classify(trainX, trainY, testX, testY)
                resultsBay.write("- Bayesian\n")
                writeResults(resultsBay, e_rate_bay, se_bay, interval_bay)
                errorResults["Bayesian"].append(e_rate_bay)

                for k in K:
                    (P_kn, E_kn, e_rate_kn, se_kn, interval_kn) = knn.classify(trainX, trainY, testX, testY, k)
                    # The sum rule combines the Bayesian and KNN outputs.
                    (P_sum, E_sum, e_rate_sum, se_sum, interval_sum) = sum_rule.classify([P_bay, P_kn], testX, testY)

                    # NOTE(review): the two per-k header writes below are
                    # read as disabled in the original; confirm.
                    # resultsKn.write("- KNN (n = %d)\n" % k)
                    writeResults(resultsKn, e_rate_kn, se_kn, interval_kn)
                    errorResults["KNN %i" % k].append(e_rate_kn)

                    # resultsSum.write("- Sum (n = %d)\n" % k)
                    writeResults(resultsSum, e_rate_sum, se_sum, interval_sum)
                    errorResults["Sum %i" % k].append(e_rate_sum)

    compare(errorResults)
def test():
    ''' knn test of dating data: hold out the first 10 percent and print the error rate '''
    # process_data hands back the raw dataset plus two tuples that are
    # unpacked below.
    dset, normalizing, labeling = process_data('datingTestSet.txt')
    norm_dset, ranges, min_vals, max_vals = normalizing
    label_indices, labels_key = labeling

    ho_ratio = 0.10
    m = norm_dset.shape[0]  # number of samples
    num_tests = int(m * ho_ratio)  # number of held-out samples

    # Everything after the held-out rows serves as training data.
    train_rows = norm_dset[num_tests:m, :]
    train_labels = label_indices[num_tests:]

    error_count = 0.0
    for i in range(num_tests):
        # The raw sample is normalised here before it is classified.
        norm_test = (dset[i, :] - min_vals) / ranges
        classification = classify(norm_test, train_rows, train_labels, 3)
        predicted = labels_key[classification]
        actual = labels_key[label_indices[i]]
        print('classifier answer: {}, real answer: {}'.format(
            predicted, actual))
        if predicted != actual:
            error_count += 1.0
    print('total error rate: {}'.format(error_count / float(num_tests)))
def classify_number_test(data_set_path, test_data_path, k, ratio=1):
    """
    Recognise digits with kNN and print the accuracy and the elapsed time.

    :param data_set_path: training data set
    :param test_data_path: test data set
    :param k: k value
    :param ratio: forwarded to file2Matrix when the training set is loaded
    :return: None
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    start = time.perf_counter()
    data_set, lables = file2Matrix(data_set_path, ratio)  # training data
    data_set_test, lables_test = file2Matrix(test_data_path)  # test data
    true_count = 0
    for i in range(data_set_test.shape[0]):
        result_lable = classify(data_set_test[i], data_set, lables, k)
        if result_lable == lables_test[i]:
            true_count = true_count + 1
    end = time.perf_counter()
    print('k值为:%d,正确率为:%0.2f%%,耗时:%0.6f' % (k, (true_count / data_set_test.shape[0] * 100), (end - start)))
def hand_writing_class_test():
    """
    Classify every handwritten-digit test file and print each result.

    NOTE(review): error_count and the test file's own label are computed
    but never used, so no error rate is reported — confirm whether that is
    intended.
    :return: None
    """
    # Step 1: build the training set, one 1024-wide row per file.
    training_dir = '../data/digits/training_digits'
    training_file_list = listdir(training_dir)
    training_data_set = np.zeros((len(training_file_list), 1024))
    hw_labels = []
    for row, filename_str in enumerate(training_file_list):
        # The label is the integer before the first underscore.
        hw_labels.append(int(filename_str.split('_')[0]))
        training_data_set[row, :] = img_to_vector('../data/digits/training_digits/%s' % filename_str)

    # Step 2: classify the test files.
    test_file_list = listdir('../data/digits/test_digits')
    error_count = 0.0
    for filename_str in test_file_list:
        class_num_str = int(filename_str.split('_')[0])
        test_img_vector = img_to_vector('../data/digits/test_digits/%s' % filename_str)
        # 5 is passed as the last argument of classify().
        classifier_result = classify(test_img_vector, training_data_set, hw_labels, 5)
        print(classifier_result)
def cross_validate(train_data, train_labels, k, distance, F=5, prints=True):
    """
    Run F-fold cross validation on the given training set.

    Fold f validates on rows [f * C, (f + 1) * C) with C = rows // F and
    trains on all other rows. When prints is true, the confusion matrix of
    every round is printed with the row/column labels '1', '2', '7'.

    Returns an array holding the accuracy of each round.
    """
    row_count = train_data.shape[0]
    fold_size = row_count // F  # validation rows per round
    all_rows = np.arange(0, row_count)
    accuracy = np.zeros(F)

    for f in range(F):
        held_out = np.arange(f * fold_size, (f + 1) * fold_size)
        kept = np.setdiff1d(all_rows, held_out)

        # Predict the held-out rows from the kept rows.
        predicted_labels = knn.classify(train_data[kept],
                                        train_labels[kept],
                                        train_data[held_out], k, distance)
        con_matrix = knn.confusion_matrix(train_labels[held_out],
                                          predicted_labels)

        # Printing is switched off when several k values are swept.
        if prints:
            class_names = ['1', '2', '7']
            con_mat_df = pd.DataFrame(con_matrix, index=class_names,
                                      columns=class_names)
            print('Cross-validation round', f + 1)
            print(
                'Confusion matrix: Predicted classes along horizontal axis. Actual classes along vertical axis.'
            )
            print(con_mat_df)

        accuracy[f] = knn.accuracy(con_matrix)

    return accuracy
def test():
    ''' classify the handwriting test digits and print the error rate '''
    base_dir = getcwd()

    train_dataset, train_filenames = build_dataset(base_dir + '/datasets/hw/trainingDigits/')
    train_labels = labels_from_filenames(train_filenames)
    train_vectors = to_vectors(train_dataset)

    test_dataset, test_filenames = build_dataset(getcwd() + '/datasets/hw/testDigits/')
    test_labels = labels_from_filenames(test_filenames)
    test_vectors = to_vectors(test_dataset)

    n_neighbors = classify(train_vectors, test_vectors, train_labels)
    # Count the predictions that disagree with the label at the same index.
    err_count = sum(1 for idx, prediction in enumerate(n_neighbors)
                    if prediction != test_labels[idx])
    print('error rate: {}'.format(err_count / float(len(test_vectors))))
def test():
    ''' classify one sample typed in by the user against the dating data '''
    dset, normalizing, labeling = process_data('datingTestSet.txt')
    norm_set, ranges, min_vals, max_vals = normalizing
    label_indices, label_keys = labeling

    # Texts looked up with the classification result.
    class_names = ['not at all', 'in small doses', 'in large doses']

    # The prompts are asked in this order; the sample vector below is built
    # as [miles, gaming, ice cream].
    gaming = float(input('percent time playing video games?'))
    flyerMiles = float(input('frequent flyer miles earned each year?'))
    iceCream = float(input('liters of ice cream consumed per year?'))
    sample = array([flyerMiles, gaming, iceCream])

    # Normalise the sample the same way as the data set.
    scaled_sample = (sample - min_vals) / ranges
    verdict = classify(scaled_sample, norm_set, label_indices, 3)
    # The result is used as a 1-based index into class_names.
    print('You will probably like this person: ',
          class_names[verdict - 1])
def main():
    """Dispatch on the technique given as third command-line argument.

    sys.argv[1] is the training file and sys.argv[2] the test file.
    "nearest" and "best" run knn.classify with k = 201, "nnet" trains and
    evaluates the neural net (its size argument is the last command-line
    argument), "adaboost" calls adaboost.main with sys.argv[4].

    All print statements were converted to single-argument print() calls so
    the function parses on Python 3 and prints the same text on Python 2.
    """
    technique = sys.argv[3]
    if technique in ("nearest", "best"):
        if technique == "best":
            print("We are getting best accuracy for KNN, and adaboost is closely behind it...")
        k = 201
        print("inside KNN , this takes upto to 15 minutes to show the output")
        train = sys.argv[1]
        test = sys.argv[2]
        # KNN is simple, but its cost grows with the number of samples.
        result = knn.classify(train, test, k)
        confusionMatrix(result)
        print("knn_output.txt is created. You can see our predictions there")
    elif technique == "nnet":
        input_data, class_labels, names = read_data(sys.argv[1])
        weights_one, weights_two = nnets.train(input_data, class_labels, int(sys.argv[-1]))
        output_data, oclass_labels, test_names = read_data(sys.argv[2])
        pred = feed_forward(input_data, weights_one, weights_two)
        print("Train Accuracy is %s" % (accuracy(pred, class_labels),))
        test_pred = feed_forward(output_data, weights_one, weights_two)
        print("Test Accuracy is %s" % (accuracy(test_pred, oclass_labels),))
        cf_list = write_to_file(test_pred, oclass_labels, test_names)
        confusionMatrix(cf_list)
        print("nnet_output.txt is created. You can see our predictions there")
    elif technique == "adaboost":
        print("hii... this would take around 1 minute to run")
        adaboost.main(sys.argv[1], sys.argv[2], int(sys.argv[4]))
        print("adaboost_output.txt is created. You can see our predictions there")
def test_action_recognition(self):
    """The point (0, 155) must be classified as "action"."""
    query = (0, 155)
    self.assertEqual("action", knn.classify(query, self.training_set))
def test_romance_recognition(self):
    """The point (180, 8) must be classified as "romance"."""
    query = (180, 8)
    self.assertEqual("romance", knn.classify(query, self.training_set))
def recog(vec, ts):
    """Classify vec against ts.mat / ts.labels, passing 3 as last argument."""
    samples, sample_labels = ts.mat, ts.labels
    return knn.classify(vec, samples, sample_labels, 3)
def multiPack(mix_args):
    """Call classify() with the first four items of mix_args.

    Returns the comparison of that result with the fifth item, so one packed
    argument carries both the call arguments and the expected answer.
    """
    predicted = classify(mix_args[0], mix_args[1], mix_args[2], mix_args[3])
    expected = mix_args[4]
    return predicted == expected
from knn import classify
from utils import load
import numpy as np
from tqdm import tqdm
import datetime

# Script: classify a slice of the test images and print run time and accuracy.
train_images, train_labels, test_images, test_labels = load()

# Shrink the training set.
train_images, train_labels = train_images[:10000], train_labels[:10000]
# Only the first 100 test samples, to keep the run short.
test_images, test_labels = test_images[:100], test_labels[:100]
k = 5
print("读取完成")

test_size = test_images.shape[0]
print("正在分类:{}".format(test_size))

started = datetime.datetime.now()
outY = []
for i in tqdm(range(test_size)):
    outY.append(classify(test_images[i], train_images, train_labels, k))
print(datetime.datetime.now() - started)

# Share of predictions equal to the label at the same position.
acc = np.mean(outY == test_labels)
print('\n\n')
print("正确率:{}".format(acc))
# Accuracy noted by the author: 0.9683
#!/usr/bin/env python
#-*- coding:utf-8 -*-
# Script: classify one hard-coded point against the data set built by
# knn.creatDataSet() and print the point together with the result.
import knn
from numpy import *

dataset, labels = knn.creatDataSet()

# Named `sample` rather than `input` so the builtin input() is not shadowed.
sample = array([1.1, 0.3])
K = 3
output = knn.classify(sample, dataset, labels, K)
print("测试数据为:", sample, "分类结果为:", output)
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Script: count the misclassified rows of './optdigits.tes' and plot the
# test data. The former "#coding = utf-8" line was not a valid encoding
# declaration (no space is allowed before the "=").
from numpy import *
import importData
import plot
import knn

dataSet_tra, label_tra = importData.dataFrmFile('./optdigits.tra')
dataSet_tes, label_tes = importData.dataFrmFile('./optdigits.tes')

cnt = 0
for index, expected in enumerate(label_tes):
    # 10 is passed as the last argument of knn.classify.
    if expected != knn.classify(dataSet_tes[index], dataSet_tra, label_tra, 10):
        cnt += 1

# NOTE(review): printed once after the loop; confirm the count was not
# meant to be printed on every iteration.
print(cnt)
plot.plot(dataSet_tes, label_tes, 4, 5)
import knn
from numpy import *

# Script: print the data set from knn.createDataset(), then the result of
# classifying one hard-coded row with 3 as the last argument.
dataset, labels = knn.createDataset()
for part in (dataset, labels):
    print(part)

predict = array([[1, 1.1]])
result = knn.classify(predict, dataset, labels, 3)
print(result)