def train():
    """Evaluate the kNN classifier on the dating data with a hold-out split.

    Loads ``datingTestSet2.txt`` through ``knn.file2matrix``, normalises it
    with ``knn.autoNorm`` and prints the normalised matrix and the labels.
    The first 10% of the rows are then classified (k=3) against the
    remaining rows; every prediction is printed, followed by the overall
    error rate.  Nothing is returned.
    """
    datingDataMat, datingLables = knn.file2matrix('datingTestSet2.txt')
    normMat, rangeVals, minVals = knn.autoNorm(datingDataMat)
    print(normMat)
    print(datingLables)
    # knn.plotData(datingDataMat, datingLables)

    holdout_fraction = 0.10           # share of rows used for validation
    row_total = normMat.shape[0]      # number of rows in the data set
    test_rows = int(row_total * holdout_fraction)
    wrong = 0.0
    for idx in range(test_rows):
        guess = knn.classify0(normMat[idx, :],
                              normMat[test_rows:row_total, :],
                              datingLables[test_rows:row_total],
                              3)
        truth = datingLables[idx]
        print('分类器返回: %d, 实际的结果是:%d' % (guess, truth))
        if guess != truth:
            wrong += 1.0
    print('错误率是: %f' % (wrong / (float(test_rows))))
def digit_class_test():
    """Print the kNN (k=3) error rate on the handwritten-digit files.

    Every file in ``trainingDigits`` becomes one 1024-column row of the
    training matrix (via ``image2vector``); the integer before the first
    ``_`` of the file name is its label.  Each file in ``testDigits`` is
    then classified and compared with the label parsed from its own name.
    """
    train_files = listdir('trainingDigits')
    train_matrix = zeros((len(train_files), 1024))
    digits_labels = []
    for row, name in enumerate(train_files):
        stem = name.split('.')[0]
        digits_labels.append(int(stem.split('_')[0]))
        train_matrix[row, :] = image2vector('trainingDigits/%s' % name)

    test_files = listdir('testDigits')
    misses = 0
    for name in test_files:
        stem = name.split('.')[0]
        expected = int(stem.split('_')[0])
        sample = image2vector('testDigits/%s' % name)
        predicted = knn.classify0(sample, train_matrix, digits_labels, 3)
        if predicted != expected:
            misses += 1

    error_rate = float(misses) / float(len(test_files))
    print("Error rate is: " + str(error_rate))
def handwritingClassTest(k):
    """Return the kNN error rate on the digit files for a given ``k``.

    Training rows come from the files in ``trainingDigits`` and test samples
    from ``testDigits`` (both read with ``img2vector``).  The label of a
    file is the integer before the first ``_`` in its name.

    :param k: number of neighbours handed to ``knn.classify0``.
    :return: misclassified test files divided by the number of test files,
        as a float.
    """
    training_names = os.listdir('trainingDigits')
    trainingMat = zeros((len(training_names), 1024))
    hwLabels = []
    for row, name in enumerate(training_names):
        hwLabels.append(int(name.split('_')[0]))
        trainingMat[row, :] = img2vector('trainingDigits/%s' % name)

    test_names = os.listdir('testDigits')
    misses = 0.0
    for name in test_names:
        expected = int(name.split('_')[0])
        sample = img2vector('testDigits/%s' % name)
        predicted = knn.classify0(sample, trainingMat, hwLabels, k)
        if predicted != expected:
            misses += 1.0
    return float(misses / len(test_names))
def train(trainImagePath, testImagePath):
    """Measure kNN (k=3) accuracy on digit image files.

    Every file in ``trainImagePath`` becomes one 1024-column row of the
    training matrix (via ``imageTool.img2vector``); its label is the integer
    before the first ``_`` of the file name.  Each file in ``testImagePath``
    is then classified with ``knn.classify0`` and compared with the label
    parsed from its own name.  Per-file results, the error count and the
    error rate are printed; nothing is returned.

    :param trainImagePath: directory holding the training files.
    :param testImagePath: directory holding the test files.
    """
    def label_of(file_name):
        # "<label>_<rest>.<ext>" -> int(label)
        return int(file_name.split('.')[0].split('_')[0])

    train_names = os.listdir(trainImagePath)
    trainningMat = np.zeros((len(train_names), 1024))
    hwLabels = []
    for row, name in enumerate(train_names):
        hwLabels.append(label_of(name))
        trainningMat[row, :] = imageTool.img2vector('%s/%s' % (trainImagePath, name))

    test_names = os.listdir(testImagePath)
    wrong = 0.0
    for name in test_names:
        expected = label_of(name)
        sample = imageTool.img2vector('%s/%s' % (testImagePath, name))
        got = knn.classify0(sample, trainningMat, hwLabels, 3)
        print('分类器返回的数字是:%d, 实际的数字是:%d' % (got, expected))
        if got != expected:
            wrong += 1.
    print('总的错误数: %d' % wrong)
    print('错误率: %f' % (wrong / float(len(test_names))))
def handwritingClassTest():
    """Print kNN (k=3) results for the handwritten-digit files.

    Training rows are read from ``trainingDigits`` and test samples from
    ``testDigits`` with ``img2vect``; the label of a file is the integer
    before the first ``_`` of its name.  For every test file the predicted
    and real label are printed, then the error rate and the error count.
    """
    hwLabels = []
    traingingFileList = os.listdir('trainingDigits')
    m = len(traingingFileList)
    trainingMat = np.zeros((m, 1024))
    for i, fileNameStr in enumerate(traingingFileList):
        fileStr = fileNameStr.split('.')[0]
        hwLabels.append(int(fileStr.split('_')[0]))
        trainingMat[i, :] = img2vect('trainingDigits/%s' % fileNameStr)

    testFileList = os.listdir("testDigits")
    errCount = 0.0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        classNumStr = int(fileNameStr.split('_')[0])
        vectorUnderTest = img2vect('testDigits/%s' % fileNameStr)
        classifierResult = knn.classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        if classifierResult != classNumStr:
            errCount = errCount + 1
        print("calc is %d, real answer is %s" % (classifierResult, classNumStr))
    # The rate must be interpolated with '%'; passing it as a second print
    # argument left the "%f" placeholder unformatted.
    print("error rate is %f" % (errCount / float(mTest)))
    print("error num is %d" % errCount)
def handwritingClassTest():
    """Print kNN (k=3) results for the handwritten-digit files.

    Training rows are read from ``trainingDigits`` and test samples from
    ``testDigits`` with ``img2vector``; the label of a file is the integer
    before the first ``_`` of its name.  For every test file the predicted
    and real label are printed, then the number of errors and the error
    rate.
    """
    hwLabels = []
    trainingFileSet = os.listdir('trainingDigits')
    m = len(trainingFileSet)
    trainingMat = zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileSet):
        fileStr = fileNameStr.split('.')[0]
        hwLabels.append(int(fileStr.split('_')[0]))
        trainingMat[i, :] = img2vector('trainingDigits/%s' % fileNameStr)

    testFileList = os.listdir('testDigits')
    errorCount = 0.0
    # Number of test files (not the length of one file name).
    mTest = len(testFileList)
    # Walk over every test file; without this loop only the last training
    # file name would ever be looked up in the test directory.
    for fileNameStr in testFileList:
        fileStr = fileNameStr.split('.')[0]
        # The label must be an int: it is formatted with %d and compared
        # with the integer labels stored in hwLabels.
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        print("the classifier came back with:%d, the real answer is:%d" % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("\nThe total number of error is:%d" % errorCount)
    print("\nthe Total error rate is:%f" % (errorCount / float(mTest)))
def test_main_hand_writing(self):
    """Run the handwritten-digit kNN check and print the outcome.

    Files in ``digits/train`` form the 1024-column training matrix (read
    with ``knn.read_vector``, labelled with ``get_label``).  Each file in
    ``digits/test`` is classified with k=3; the prediction and the real
    label are printed per file, followed by a summary line with the number
    of test files, the error count and the error rate.
    """
    train_files = os.listdir('digits/train')
    train_matrix = numpy.zeros((len(train_files), 1024))
    labels = []
    for row, file_name in enumerate(train_files):  # e.g. 0_22.txt
        labels.append(get_label(file_name))
        train_matrix[row, :] = knn.read_vector('digits/train/%s' % file_name)

    test_files = os.listdir('digits/test')
    test_size = len(test_files)
    err_count = 0.0
    for file_name in test_files:  # e.g. 0_22.txt
        real_label = get_label(file_name)
        test_vector = knn.read_vector('digits/test/%s' % file_name)
        classifier_result = knn.classify0(test_vector, train_matrix, labels, 3)
        print("predict: %s real: %s" % (classifier_result, real_label))
        if classifier_result != real_label:
            err_count += 1.0

    err_rate = err_count / float(test_size)
    print('total: %d error: %d rate: %f' % (test_size, err_count, err_rate))
def choose_action_by_knn(db, errorType, k):
    """Pick an action id for ``errorType`` with a kNN lookup.

    The query point is ``[errorType, 1, <minimum BuildTime>]``; per the
    original author's note the ``1`` stands for a successful action and the
    minimum build time for the fastest one.  The reference data is every
    row of the action-trigger result table, converted by
    ``knn.matrix_from_mysql``.

    :param db: database handle passed through to ``data_select``.
    :param errorType: error type used as the first query feature.
    :param k: number of neighbours handed to ``knn.classify0``.
    :return: whatever ``knn.classify0`` returns for the query point.
    """
    fastest_build = data_select(db, actionTriggerResTb, whatSelect='min(BuildTime)')[0][0]
    query_point = array([errorType, 1, fastest_build])
    rows = data_select(db, actionTriggerResTb)
    dataSet, labels = knn.matrix_from_mysql(rows)
    return knn.classify0(query_point, dataSet, labels, k)
def classfiy_person():
    """Ask for three features on the console and print the predicted class.

    The answers are normalised with the minimum values and ranges that
    ``knn.auto_normal`` returns for ``./knn/datingTestSet2.txt`` and then
    classified with ``knn.classify0`` (k=3).  The class number, minus one,
    selects the description that is printed.
    """
    descriptions = ['not at all', 'in small doses', 'in large doses']
    game_share = float(input("percentage of time spent playing video games ?"))
    flier_miles = float(input("frequent flier miles earned per year ?"))
    ice_cream_liters = float(input("liter of ice cream consumed per year ?"))

    dating_mat, dating_labels = knn.file2matrix('./knn/datingTestSet2.txt')
    normal_mat, ranges, min_values = knn.auto_normal(dating_mat)

    # Query features in the order: miles, game share, ice cream.
    query = array([flier_miles, game_share, ice_cream_liters])
    scaled_query = (query - min_values) / ranges
    predicted = knn.classify0(scaled_query, normal_mat, dating_labels, 3)
    print("You will probably like this person: ",
          descriptions[predicted - 1],
          "(" + str(predicted) + ")")
def begin():
    """Ask for three feature values and return the predicted liking level.

    Reads "percent", "miles" and "liter" from the console (in that order),
    normalises them with the minimum and range that ``autoNorm`` returns
    for ``./datingTestSet.txt`` and classifies the result with
    ``knn.classify0`` (k=3).

    :return: the description selected by the class number minus one.
    """
    descriptions = ['not at all', 'a litte like', 'like very much']
    percent = float(raw_input("percent"))
    miles = float(raw_input("miles"))
    liters = float(raw_input("liter"))
    # NOTE(review): the query is ordered percent, miles, liter - confirm
    # that this matches the column order produced by file2matrix.
    query = array([percent, miles, liters])
    data_matrix, labels = file2matrix("./datingTestSet.txt")
    norm_data, value_range, value_min = autoNorm(data_matrix)
    scaled_query = (query - value_min) / value_range
    predicted = knn.classify0(scaled_query, norm_data, labels, 3)
    return descriptions[predicted - 1]
def begin():
    """Prompt for three features and return the matching description.

    The three console answers ("percent", "miles", "liter") form the query
    vector.  It is scaled with the per-column minimum and range returned by
    ``autoNorm`` for ``./datingTestSet.txt`` and classified by
    ``knn.classify0`` with k=3.

    :return: entry of the description list at index ``class - 1``.
    """
    answers = [float(raw_input(prompt)) for prompt in ("percent", "miles", "liter")]
    # NOTE(review): feature order is percent, miles, liter - verify against
    # the column order of the matrix returned by file2matrix.
    query = array(answers)
    data_matrix, labels = file2matrix("./datingTestSet.txt")
    norm_data, spans, minimums = autoNorm(data_matrix)
    predicted = knn.classify0((query - minimums) / spans, norm_data, labels, 3)
    descriptions = ['not at all', 'a litte like', 'like very much']
    return descriptions[predicted - 1]
def classifiy_moives():
    """Classify one hard-coded point against four labelled 2-D samples.

    Runs ``knn.classify0`` with k=3 and prints the returned label.
    """
    romance, action = '爱情片', '动作片'
    # Four samples with two features each, and their labels.
    samples = np.array([[1, 101], [5, 89], [108, 5], [115, 8]])
    sample_labels = [romance, romance, action, action]
    # Point to classify.
    query = [101, 20]
    print(knn.classify0(query, samples, sample_labels, 3))
def classifiPerson():
    """Ask for three features on the console and print the predicted class.

    The answers are normalised with the minimum values and ranges returned
    by ``autoNorm`` for ``datingTestSet2.txt`` and classified with
    ``knn.classify0`` (k=3).  The class number, minus one, selects the
    description that is printed.
    """
    resultList = [u'完全不会', u'小概率', u'大概率']
    percentTats = float(raw_input("percent of time spent playing video game?"))
    ffMiles = float(raw_input("frequent flier miles earned per year?"))
    iceCream = float(raw_input("liters of ice cream consumed per year?"))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # Feature order follows the other callers in this file that use the
    # same data file: miles, game share, ice cream.
    # NOTE(review): confirm against the column order of datingTestSet2.txt.
    inArr = array([ffMiles, percentTats, iceCream])
    # Parentheses, not brackets: a list around the difference would turn
    # the query into a 2-D array.
    classifierResult = knn.classify0((inArr - minVals) / ranges, normMat, datingLabels, 3)
    print("%s %s" % ("you will propabably like this persion: ",
                     resultList[classifierResult - 1]))
def test():
    """Smoke-test the ``knn`` helpers and plot the dating data.

    Prints the sample data set from ``knn.create_dataset``, classifies the
    point ``[0, 0]`` with k=3 and prints the result, then loads
    ``./knn/datingTestSet2.txt``, prints the matrix and labels and hands
    both to ``knn.show_plt``.
    """
    group, labels = knn.create_dataset()
    print(group)
    print(labels)
    predicted = knn.classify0([0, 0], group, labels, 3)
    print("distance is %s !" % (predicted))

    # Relative paths resolve differently inside PyCharm; check the working
    # directory under Run -> Edit Configurations.
    # dating_mat, dating_labels = knn.file2matrix('./knn/datingTestSet.txt')
    dating_mat, dating_labels = knn.file2matrix('./knn/datingTestSet2.txt')
    print(dating_mat)
    print(dating_labels)
    knn.show_plt(dating_mat, dating_labels)
def datingClassTest():
    """Print hold-out test results for the dating kNN classifier.

    The first 10% of the normalised rows are used as test vectors and are
    classified (k=3) against the remaining 90% only.  Each prediction is
    printed next to the real label, followed by ``<errors>:<tested>``.
    """
    rating = 0.1
    datingDataMat, labels = file2matrix("./datingTestSet.txt")
    normDataSet, ranges, minVals = autoNorm(datingDataMat)
    m = normDataSet.shape[0]
    testNum = int(m * rating)
    # Keep the test rows out of the reference set; otherwise every test
    # vector finds itself at distance 0 and the error count is meaningless.
    trainSet = normDataSet[testNum:m, :]
    trainLabels = labels[testNum:m]
    err = 0.0
    for i in range(testNum):
        index = knn.classify0(normDataSet[i, :], trainSet, trainLabels, 3)
        print("predict:" + str(index) + " real:" + str(labels[i]))
        if index != labels[i]:
            err += 1.0
    # Report errors against the number of vectors actually tested.
    print(str(err) + ":" + str(testNum))
def datingClassTest():
    """Print hold-out test results for the dating kNN classifier.

    The first 10% of the normalised rows are used as test vectors and are
    classified (k=3) against the remaining 90% only.  Each prediction is
    printed next to the real label, followed by ``<errors>:<tested>``.
    """
    rating = 0.1
    datingDataMat, labels = file2matrix("./datingTestSet.txt")
    normDataSet, ranges, minVals = autoNorm(datingDataMat)
    m = normDataSet.shape[0]
    testNum = int(m * rating)
    # Keep the test rows out of the reference set; otherwise every test
    # vector finds itself at distance 0 and the error count is meaningless.
    trainSet = normDataSet[testNum:m, :]
    trainLabels = labels[testNum:m]
    err = 0.0
    for i in range(testNum):
        index = knn.classify0(normDataSet[i, :], trainSet, trainLabels, 3)
        print("predict:" + str(index) + " real:" + str(labels[i]))
        if index != labels[i]:
            err += 1.0
    # Report errors against the number of vectors actually tested.
    print(str(err) + ":" + str(testNum))
def main():
    """Run the small kNN demo and then ``cp.predict()``.

    Prints the sample data set from ``knn.createDataSet``, classifies the
    point ``[1, 0.9]`` with k=3 and prints the point with its label.
    """
    print('Main Begin******************')
    group, labels = knn.createDataSet()
    # Same output as printing group, a newline and labels as separate items.
    print('%s %s%s' % (group, '\n', labels))
    predict = [1, 0.9]
    label = knn.classify0(predict, group, labels, 3)
    print('%s %s %s' % (predict, ' lable is: ', label))
    cp.predict()
    print('Main End********************')
def datingClassTest():
    """Print hold-out test results for the dating kNN classifier.

    Loads ``datingTestSet2.txt``, normalises it with ``autoNorm`` and
    classifies the first 10% of the rows (k=3) against the remaining rows.
    One line per test row is printed, followed by the total error rate.
    """
    holdout_fraction = .1
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    row_total = normMat.shape[0]
    # The leading rows are the test vectors.
    test_rows = int(row_total * holdout_fraction)
    wrong = 0.0
    for row in range(test_rows):
        predicted = knn.classify0(normMat[row, :],
                                  normMat[test_rows:row_total, :],
                                  datingLabels[test_rows:row_total],
                                  3)
        print("line is %d,classifier came back with: %d,the real answer is %d" % (
            row, predicted, datingLabels[row]))
        if predicted != datingLabels[row]:
            wrong += 1.
    print("total error rate is : %f" % (wrong / float(test_rows)))
def predict():
    """Ask for three features on the console and print the predicted class.

    The answers are normalised with the minimum values and ranges returned
    by ``knn.autoNorm`` for ``datingTestSet2.txt`` and classified with
    ``knn.classify0`` (k=3).  The class number, minus one, selects the
    description that is printed.
    """
    descriptions = ['一点也不喜欢', '有点喜欢', '非常喜欢']
    game_share = float(raw_input('玩游戏的时间是: '))
    flier_miles = float(raw_input('每年的飞行公里数: '))
    ice_cream_liters = float(raw_input('每年消耗的冰淇淋: '))

    datingDataMat, datingLabels = knn.file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = knn.autoNorm(datingDataMat)

    # Normalise the query the same way as the data set.
    query = np.array([flier_miles, game_share, ice_cream_liters])
    scaled_query = (query - minVals) / ranges
    predicted = knn.classify0(scaled_query, normMat, datingLabels, 3)
    print('%s %s' % ('预测你可能喜欢这个人的程度:', descriptions[predicted - 1]))
def predict():
    """Prompt for three features and print the predicted liking level.

    The console answers are scaled with the per-column minimum and range
    from ``knn.autoNorm`` (data file ``datingTestSet2.txt``), classified by
    ``knn.classify0`` with k=3, and the description at index ``class - 1``
    is printed.
    """
    resultList = ['一点也不喜欢', '有点喜欢', '非常喜欢']
    percentTats = float(raw_input('玩游戏的时间是: '))
    ffMiles = float(raw_input('每年的飞行公里数: '))
    iceCream = float(raw_input('每年消耗的冰淇淋: '))
    datingDataMat, datingLabels = knn.file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = knn.autoNorm(datingDataMat)
    # Scale the query vector exactly like the reference rows.
    scaled = (np.array([ffMiles, percentTats, iceCream]) - minVals) / ranges
    class_number = knn.classify0(scaled, normMat, datingLabels, 3)
    print('%s %s' % ('预测你可能喜欢这个人的程度:', resultList[class_number - 1]))
def test_main_dating(self):
    """Run the dating-data hold-out check and print the outcome.

    Half of the normalised rows (the leading ones) are classified with k=3
    against the other half.  Each prediction is printed with the real
    label, followed by a summary line with the number of test rows, the
    error count and the error rate.
    """
    test_ratio = 0.50
    dataset_matrix, labels = knn.read_matrix('dating/dataset.txt')
    norm_matrix, ranges, min_value = knn.auto_norm(dataset_matrix)
    size = norm_matrix.shape[0]
    test_num = int(size * test_ratio)

    err_count = 0.0
    for row in range(test_num):
        predicted = knn.classify0(norm_matrix[row, :],
                                  norm_matrix[test_num:size, :],
                                  labels[test_num:size],
                                  3)
        print("predict: %d real: %d" % (predicted, labels[row]))
        if predicted != labels[row]:
            err_count += 1.0

    err_rate = err_count / float(test_num)
    print('total: %d error: %d rate: %f' % (test_num, err_count, err_rate))
def datingClassTest():
    """Print hold-out test results for the dating kNN classifier.

    The first 10% of the normalised rows are classified (k=3) against the
    remaining rows and their labels.  Each prediction is printed next to
    the real answer, followed by the total error rate.
    """
    hoRatio = 0.10
    # NOTE(review): "file2matix" is kept as written - confirm that the knn
    # module really exposes the loader under this spelling.
    datingDataMat, datingLabels = knn.file2matix(
        '/home/matija/Projects/personal_projects/show-me-the-code/data-science/CollectiveIntelligence/dataSets/datingTestSet1.txt'
    )
    normMat, ranges, minVals = knn.autoNorm(datingDataMat)
    m = normMat.shape[0]
    # Number of vectors used to test the classifier.
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        # The labels are sliced exactly like the reference rows so that
        # row j of the reference matrix lines up with label j.
        classifierResult = knn.classify0(normMat[i, :],
                                         normMat[numTestVecs:m, :],
                                         datingLabels[numTestVecs:m],
                                         3)
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
def count(disc_set, W_LDA, train_final, label, test_NUM_IN):
    """Return the share of test images classified as ``test_NUM_IN``.

    The images returned by ``loadImageSet_many(test_NUM_IN)`` are projected
    with ``disc_set`` and then ``W_LDA``; each of the first
    ``new_test_tol`` projected samples is classified with ``knn.classify0``
    (k=7) against ``train_final`` / ``label``.

    :param disc_set: first projection, applied as ``disc_set.T * images.T``.
    :param W_LDA: second projection, applied as ``W_LDA.T * <first result>``.
    :param train_final: projected training samples (transposed before use).
    :param label: training labels (transposed before use).
    :param test_NUM_IN: class to load test images for and to compare with.
    :return: hits divided by ``new_test_tol``, as a float.
    """
    # Accuracy statistics for LDA.
    print('test_num is ' + str(test_NUM_IN))
    new = loadImageSet_many(test_NUM_IN)
    # newImg = LBP.LBP(rows,cols,new.T).T
    projected = W_LDA.T * (disc_set.T * new.T)

    hits = 0
    position = 0
    while position < new_test_tol:
        predicted = knn.classify0(projected.T[position], train_final.T, label.T, 7)
        if predicted == test_NUM_IN:
            hits += 1
        position += 1

    statistic = float(hits) / new_test_tol
    print('test_NUM_IN=' + str(test_NUM_IN) + ' ' + str(statistic))
    return statistic
def error_prediction_by_knn(db, k):
    """Predict the error type of the current job with kNN and print it.

    The query point is ``[job id, conf.recentSuccessRate, conf.buildTime]``.
    The job id is looked up by ``conf.jobName``; when that lookup fails the
    largest known id plus one is used instead.  The reference data is every
    row of the error-feature table, converted by ``knn.matrix_from_mysql``.

    :param db: database handle passed through to ``data_select``.
    :param k: number of neighbours; nothing is predicted when fewer than
        ``k`` reference rows exist.
    :return: ``None`` in all cases; results are printed.
    """
    try:
        job_id_for_predict = data_select(db, jobNameTb, 'JobName', conf.jobName, 'id')[0][0]
    except Exception:
        # Lookup failed: fall back to the next free id.  Catching Exception
        # rather than everything lets Ctrl-C and SystemExit through.
        job_id_for_predict = data_select(db, jobNameTb, whatSelect='max(id)')[0][0] + 1
    inX = array([job_id_for_predict, conf.recentSuccessRate, conf.buildTime],
                dtype='float64')
    res = data_select(db, errorFeatureTb)
    dataSet, labels = knn.matrix_from_mysql(res)
    if len(dataSet) < k:
        print("There is not enough data to predict.")
        return
    knn.plot_save(dataSet, labels, 'dataPlot.png', show=True)
    error_predict = knn.classify0(inX, dataSet, labels, k)
    error_text = data_select(db, errorTypeTb, 'id', error_predict, 'ErrorString')[0][0]
    print("%s %s" % ("I predict the error type is", error_text))
    print("%s %s" % ("The Error collected is", conf.error))
def mnist_test(numTrain=200, numTest=100, k=5):
    """Print the kNN error rate on the handwritten-digit data.

    :param numTrain: number of leading training samples to use.
    :param numTest: number of leading test samples to use.
    :param k: number of neighbours handed to ``knn.classify0``.
    """
    data = load_data()
    train_inputs = data['training_inputs'][:numTrain]
    train_labels = data['training_labels'][:numTrain]
    test_inputs = data['test_inputs'][:numTest]
    test_labels = data['test_labels'][:numTest]

    sample_count = test_inputs.shape[0]
    wrong = 0
    for idx in xrange(sample_count):
        predicted = knn.classify0(test_inputs[idx],
                                  dataSet=train_inputs,
                                  labels=train_labels,
                                  k=k)
        if predicted != test_labels[idx]:
            wrong += 1
    print("测试数据错误率为 %f" % (wrong / float(sample_count)))
def handwritingClassTest():
    """Print kNN (k=3) results for the handwritten-digit files.

    Training rows are read from ``trainingDigits`` and test samples from
    ``testDigits`` with ``img2vector``.  The label of a file is the text
    before the first ``_`` of its name (kept as a string).  For every test
    file the predicted and true label are printed, then the error rate.
    """
    # Hidden entries (names starting with '.') are not digit files; every
    # other file is used.  Starting the loops at index 1 dropped one real
    # file per directory and left an unlabeled all-zero matrix row.
    trainListFiles = [name for name in os.listdir('trainingDigits')
                      if not name.startswith('.')]
    m = len(trainListFiles)
    trainMat = zeros([m, 1024])
    hwLabels = []
    for i, fileNameStr in enumerate(trainListFiles):
        classNum = fileNameStr.split('.')[0].split('_')[0]
        hwLabels.append(classNum)
        trainMat[i, :] = img2vector('./trainingDigits/' + fileNameStr)

    testFileList = [name for name in os.listdir('./testDigits')
                    if not name.startswith('.')]
    errorcount = 0.0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        classNum = fileNameStr.split('.')[0].split('_')[0]
        vectorForTest = img2vector('./testDigits/' + fileNameStr)
        classResult = knn.classify0(vectorForTest, trainMat, hwLabels, 3)
        print("result : " + classResult + " true : " + classNum)
        if classNum != classResult:
            errorcount += 1.0
    # mTest now equals the number of files actually classified.
    print("all is : " + str(errorcount / mTest))
def datingClassTest():
    """Print hold-out test results for the dating data and plot it.

    Loads ``datingTestSet2.txt``, normalises it with ``autoNorm`` and
    classifies the first 10% of the rows (k=3) against the remaining rows.
    Each prediction, the error rate and the error count are printed;
    finally the first two columns of the raw data are shown as a scatter
    plot whose marker size and colour are 15 times the label.
    """
    # Share of the rows used for testing; the rest serve as samples.
    holdout_fraction = 0.1
    datingDataMat, datingLabels = file2matrix("datingTestSet2.txt")
    normMat, ranges, minVals = autoNorm(datingDataMat)
    row_total = normMat.shape[0]
    numTestVecs = int(row_total * holdout_fraction)
    print('numTestVecs=', numTestVecs)

    errorCount = 0
    for row in range(numTestVecs):
        predicted = classify0(normMat[row],
                              normMat[numTestVecs:row_total],
                              datingLabels[numTestVecs:row_total],
                              3)
        actual = datingLabels[row]
        print("the classifier came back with: %d, the real answer is: %d" %
              (predicted, actual))
        if predicted != actual:
            errorCount += 1
    print("the total error rate is: %f" % (errorCount / numTestVecs))
    print(errorCount)

    figure = plt.figure()
    axes = figure.add_subplot(111)
    axes.scatter(datingDataMat[:, 0], datingDataMat[:, 1],
                 15.0 * np.array(datingLabels),
                 15.0 * np.array(datingLabels))
    plt.show()
def train(trainImagePath, testImagePath):
    """Print kNN (k=3) results for digit image files.

    Files in ``trainImagePath`` are turned into the rows of a 1024-column
    matrix with ``imageTool.img2vector``; the integer before the first
    ``_`` of a file name is its label.  Every file in ``testImagePath`` is
    classified and compared with the label taken from its own name.  One
    line per test file, the error count and the error rate are printed.

    :param trainImagePath: directory holding the training files.
    :param testImagePath: directory holding the test files.
    """
    training_names = os.listdir(trainImagePath)
    trainningMat = np.zeros((len(training_names), 1024))
    hwLabels = []
    for row, file_name in enumerate(training_names):
        stem = file_name.split('.')[0]
        hwLabels.append(int(stem.split('_')[0]))
        trainningMat[row, :] = imageTool.img2vector(
            '%s/%s' % (trainImagePath, file_name))

    test_names = os.listdir(testImagePath)
    test_total = len(test_names)
    mistakes = 0.0
    for file_name in test_names:
        stem = file_name.split('.')[0]
        real_digit = int(stem.split('_')[0])
        sample = imageTool.img2vector('%s/%s' % (testImagePath, file_name))
        predicted = knn.classify0(sample, trainningMat, hwLabels, 3)
        print('分类器返回的数字是:%d, 实际的数字是:%d' % (predicted, real_digit))
        if predicted != real_digit:
            mistakes += 1.
    print('总的错误数: %d' % mistakes)
    print('错误率: %f' % (mistakes / float(test_total)))
def train():
    """Run a hold-out evaluation of the kNN classifier on the dating data.

    ``datingTestSet2.txt`` is loaded with ``knn.file2matrix`` and
    normalised with ``knn.autoNorm``; the normalised matrix and the labels
    are printed.  The leading 10% of the rows are classified with k=3
    against the rest, printing each prediction and finally the error rate.
    """
    datingDataMat, datingLables = knn.file2matrix('datingTestSet2.txt')
    normMat, rangeVals, minVals = knn.autoNorm(datingDataMat)
    print(normMat)
    print(datingLables)
    # knn.plotData(datingDataMat, datingLables)

    # Fraction of the data set used for validation.
    validation_share = 0.10
    # Total number of rows.
    total = normMat.shape[0]
    # Number of test rows.
    n_test = int(total * validation_share)

    mistakes = 0.0
    row = 0
    while row < n_test:
        predicted = knn.classify0(normMat[row, :], normMat[n_test:total, :],
                                  datingLables[n_test:total], 3)
        print('分类器返回: %d, 实际的结果是:%d' % (predicted, datingLables[row]))
        if predicted != datingLables[row]:
            mistakes += 1.0
        row += 1
    print('错误率是: %f' % (mistakes / (float(n_test))))
# -*- coding: UTF-8 -*- 或者 #coding=utf-8 ''' Created on 2016年8月20日 @author: xiaoyuan ''' import knn group,labels = knn.createDataSet() print knn.classify0([0,0], group, labels, 3)
# Normalise the data and show the result together with the per-column
# ranges and minimum values returned by knn.autoNorm.
normMat, ranges, minVals = knn.autoNorm(datingDataMat)
print('norm mat:')
print(normMat)
print('range:')
print(ranges)
# This heading introduces minVals, so it must not repeat 'norm mat:'.
print('min vals:')
print(minVals)

# Test the classifier: the leading hoRatio share of the rows is the test
# set, the remaining rows are the reference samples.
hoRatio = 0.10
m = normMat.shape[0]
numTestVecs = int(m * hoRatio)
errorCount = 0.0
for i in range(numTestVecs):
    classifierResult = knn.classify0(normMat[i, :],
                                     normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
    print("the classifier came back with: %d, the real answer is: %d" %
          (classifierResult, datingLabels[i]))
    if classifierResult != datingLabels[i]:
        errorCount += 1.0
print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
print(errorCount)

# Predict the class for values typed in by the user.
resultList = ['not at all', 'in small doses', 'in large doses']
ffMiles = float(input('frequent flier miles earned per year?'))
percentTats = float(input("percentage of time spent playing video games?"))
iceCream = float(input('liters of ice cream consumed per year?'))
inArr = array([ffMiles, percentTats, iceCream])
# The query is scaled with the same minimum values and ranges as the data.
classifierResult = knn.classify0((inArr - minVals) / ranges, normMat,
                                 datingLabels, 3)
print('You will probably like this person:', resultList[classifierResult - 1])
import knn
import operator
from numpy import *

# Build the sample data set and show it.
group, labels = knn.createdataset()
print(group)
print(labels)

# Classify the point [5, 0] with k=3 and show the returned label.
label = knn.classify0([5, 0], group, labels, 3)
print(label)

# Load the dating data file and show the matrix and the labels.
datingDataMat, datingLabels = knn.filetoMatrix(
    'C:\\Users\Folorunsho Solomon\\Documents\\GitHub\\ML\\python\\datingTestSet2.txt')
print(datingDataMat)
print(datingLabels)

import matplotlib
import matplotlib.pyplot as plt

# Prepare a figure with a single set of axes.
fig = plt.figure()
ax = fig.add_subplot(111)
#计算类间差 temp1 = EachClassMean - total_mean Fai_b = sqrt(train_samplesize) * temp1 # print EachClassMean[:,0] # print EachClassMean[:,1] #计算类内差 for i in range(0, train_tol): Fai_w[:, i] = train_pro[:, i] - EachClassMean[:, int(label[0, i]) - 1] Sb = Fai_b * Fai_b.T Sw = Fai_w * Fai_w.T LDA_dim = ClassNum - 1 eig_val, eig_vec = linalg.eig(Sw.I * Sb) eigSortIndex = argsort(-eig_val) # 从大到小排序,默认从小到大,参数为负表示降序 W_LDA = mat(eig_vec[:, eigSortIndex[:LDA_dim]]) # 取LDA方向 #训练样本再次投影 train_final = W_LDA.T * train_pro #调用knn邻近分类器 newImg = cv2.imread('D:\PyCharm\PyCharmProjects\s1_5.bmp', 0) newImg = mat(newImg).flatten().T newImg_pro = disc_set.T * newImg newImg_final = W_LDA.T * newImg_pro Class = knn.classify0(newImg_final.T, train_final.T, label.T, 7) print Class
# One 1024-column row per training file.
trainingMat = zeros((m, 1024))
for i in range(m):
    fileNameStr = trainingFileList[i]
    fileStr = fileNameStr.split('.')[0]
    # The class label is the integer before the first '_' of the file name.
    classNumStr = int(fileStr.split('_')[0])
    hwLabels.append(classNumStr)
    trainingMat[i, :] = img2vector('./digits/trainingDigits/%s' % fileNameStr)
# Get the list of test files.
testFileList = listdir('./digits/testDigits')
errorCount = 0
mTest = len(testFileList)
for i in range(mTest):
    fileNameStr = testFileList[i]
    fileStr = fileNameStr.split('.')[0]
    # The class label is the integer before the first '_' of the file name.
    classNumStr = int(fileStr.split('_')[0])
    vectorUnderTest = img2vector('./digits/testDigits/%s' % fileNameStr)
    # Classify the test vector against all training rows with k=3.
    classfierResult = knn.classify0(vectorUnderTest, trainingMat, hwLabels, 3)
    # '%' binds tighter than '+', so only the last literal is formatted.
    print('file:' + fileStr + ' the classifier came back with:%d, the real answer is:%d' % (classfierResult, classNumStr))
    if (classfierResult != classNumStr):
        errorCount += 1
print('the total number of errors is:%d' % errorCount)
print('the total error rate is:%f' % (errorCount / float(mTest)))
# -*- coding: utf-8 -*- """ Created on Fri Sep 21 10:38:54 2018 @author: fsxn2 """ import knn import matplotlib import matplotlib.pyplot as plt #group,labels=knn.createDataSet() #print(knn.classify0([0,0],group,labels,3)) group, labels = knn.file2matrix("input.txt") auto, ranges, minval = knn.autoNorm(group) print(auto) print(ranges) print(minval) #fig=plt.figure() #ax=fig.add_subplot(111) #ax.scatter(group[:,1],group[:2]) #plt.show() print(knn.classify0([1, 0, 3], group, labels, 3))
vec = f.read() return list(vec.replace('\n', '')) sizeData = len(listData) #文件的数量 cLabel = zeros(sizeData, dtype='int16') #文件类别 arrTrain = zeros((sizeData, 1024), dtype='int16') #训练样本数组 for i, j in enumerate(listData): cLabel[i] = int(j[0]) #每个文件对应的类别 arrTrain[i, :] = file2arr(dirname + '\\' + j) return cLabel, arrTrain if __name__ == '__main__': from knn import classify0 fTrain = r'..\data\Ch02\digits\trainingDigits' fTest = r'..\data\Ch02\digits\testDigits' cLabel, arrTrain = loadData(fTrain) cLabelTest, arrTest = loadData(fTest) err = 0 for j, i in enumerate(arrTest): label = classify0(i, arrTrain, cLabel, 3) if cLabelTest[j] != label: err += 1 print('错误率:', err / len(cLabelTest)) #sklearn库knn对比 from sklearn.neighbors import KNeighborsClassifier as knn model = knn(n_neighbors=3, n_jobs=4, algorithm='auto') model.fit(arrTrain, cLabel) cLabelPredict = model.predict(arrTest) print('错误率', sum(cLabelPredict != cLabelTest) / len(cLabelTest))
#!/usr/bin/env python import knn # import matplotlib import matplotlib.pyplot as plt import numpy as np group, labels = knn.create_data_set() print("group=", group) print("labels=", labels) result = knn.classify0([0, 0], group, labels, 3) print("result=", result) datingDataMat, datingLabels = knn.file2matrix('datingTestSet.txt') print("datingDataMat=", datingDataMat) print("datingLabels=", datingLabels[:20]) ''' fig = plt.figure() ax = fig.add_subplot(111) print("datingDataMat[:, 1]=", datingDataMat[:, 1][0]) print("datingDataMat[:, 2]=", datingDataMat[:, 2][0]) ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2]) # ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2], 15.0*array(datingLabels), 15.0*array(datingLabels)) plt.show() ''' normMat, ranges, minVals = knn.autoNorm(datingDataMat) print("normMat=", normMat) print("ranges=", ranges) print("minVals=", minVals)
# coding: UTF-8 import matplotlib import matplotlib.pyplot as plt import numpy as np import knn import mnist_test group, labels = knn.createDataSet() bb = knn.classify0([0,0], group, labels, 3) print bb cc = knn.knn2([0,0], group, labels, 3) # 可视化数据 dataSet, labels = knn.file2matrix('data/datingTestSet2.txt') fig = plt.figure() ax = fig.add_subplot(111) ax.scatter(dataSet[:,0], dataSet[:,1], s=15.0*labels, c=15.0*labels) plt.show() # 测试误判率 reload(knn) # testRatio为测试集比例,k为邻居个数 knn.knnTest('../data/datingTestSet2.txt',testRatio=0.2, k=3) # 测试手写数字识别