def __init__(self, symb, predlen, cat='RL', kwargs=None):
    """Initialize a learner wrapper for one symbol.

    Args:
        symb: symbol/identifier this learner is associated with.
        predlen: prediction horizon length.
        cat: learner category ('RF', 'KNN', 'SVM' or 'NN'); any other
            value (including the default 'RL') leaves ``self.learner``
            unset, matching the original behavior.
        kwargs: optional dict of keyword arguments forwarded to the
            learner constructor.
    """
    self.symb = symb
    self.predlen = predlen
    self.kwargs = kwargs
    self.cat = cat
    # Dispatch table replaces four copy-pasted if/elif branches; the
    # original also compared with `!= None` instead of `is not None`.
    factories = {
        'RF': RF.RandomForest,
        'KNN': KNN.KNN,
        'SVM': SVM.SVM,
        'NN': NN.NN,
    }
    factory = factories.get(cat)
    if factory is not None:
        # `**{}` passes no arguments, so one call covers both original paths.
        factory_kwargs = kwargs if kwargs is not None else {}
        self.learner = factory(**factory_kwargs)
def TestShapeAccuracy(train_images, train_labels, test_images, test_labels, neigh, percentageTrain):
    """Fit a KNN on a percentage of the training data and score its shape accuracy.

    Returns the accuracy percentage reported by Get_shape_accuracy.
    """
    n_train = int(train_labels.shape[0] * percentageTrain / 100)
    classifier = KNN(train_images[:n_train], train_labels[:n_train])
    predictions = classifier.predict(test_images, neigh)
    return Get_shape_accuracy(predictions, test_labels)
def handwritingClassTest(self):
    """Evaluate the KNN classifier on the handwritten-digit data set.

    Loads every training digit file into a 1x1024 vector, classifies every
    file in the test set, and prints the per-sample results plus the total
    error count and error rate. File names encode the label as
    ``<digit>_<index>.txt``.
    """
    hwLabels = []
    # Load the training data set.
    trainingFileList = listdir(Config.DATAS + 'KNN/digits/trainingDigits')
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]  # take off .txt
        classNumStr = int(fileStr.split('_')[0])  # label encoded in the name
        hwLabels.append(classNumStr)
        trainingMat[i, :] = self.img2vector(
            Config.DATAS + 'KNN/digits/trainingDigits/%s' % fileNameStr)
    # Iterate through the test set.
    testFileList = listdir(Config.DATAS + 'KNN/digits/testDigits')
    errorCount = 0.0
    mTest = len(testFileList)
    # PERF FIX: the original constructed a fresh KNN.KNN() on every loop
    # iteration; one instance serves all test files (assumes construction
    # carries no per-call state — TODO confirm against KNN implementation).
    classifier = KNN.KNN()
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileStr = fileNameStr.split('.')[0]  # take off .txt
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = self.img2vector(
            Config.DATAS + 'KNN/digits/testDigits/%s' % fileNameStr)
        classifierResult = classifier.classify(vectorUnderTest, trainingMat, hwLabels, 3)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr))
        if (classifierResult != classNumStr):
            errorCount += 1.0
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))
def main(k=10, p=2):
    """Load 5000 32x32 digit images, fit a k-NN model, and report accuracy.

    Args:
        k: number of neighbours used by the classifier.
        p: order of the Minkowski distance passed to the model.
    """
    # Load the data: 500 groups of 10 digit images, each 32x32 pixels.
    data = np.zeros((5000, 32 * 32))
    labels = np.zeros((5000, 1))
    for i in range(500):
        for j in range(10):
            row = i * 10 + j
            labels[row, :] = j
            pic = Image.open('./dataset/%d%d.png' % (i, j))
            width, height = pic.size
            for x in range(width):
                for y in range(height):
                    data[row, x * width + y] = pic.getpixel((x, y))
    # 90/10 train/test split.
    data_train, data_test = data[:4500, :], data[4500:, :]
    labels_train, labels_test = labels[:4500, :], labels[4500:, :]
    # Build the k-NN model.
    model = KNN.KNN(k, p)
    # Predict on the held-out 500 samples.
    n_correct = 0
    n_total = 500
    for i in range(500):
        pred = model.classify(data_test[i, :], data_train, labels_train)
        print('[True Lable]: %d, [Predict Label]: %d' % (int(labels_test[i][0]), pred))
        if int(pred) == int(labels_test[i][0]):
            n_correct += 1
    acc = (n_correct / n_total) * 100
    print('[Test Accuracy]: %.2f' % acc)
def get_test_error(train, test, k):
    """Return the number of rows in `test` misclassified by a k-NN fit on `train`.

    Each row's true label is assumed to be its first element.
    """
    model = knn.KNN(train)
    return sum(1 for row in test if model.predict(row, k) != row[0])
def get_training_error(train, k):
    """Return the number of training rows misclassified by a k-NN fit on `train`.

    Each row's true label is assumed to be its first element.
    """
    model = knn.KNN(train)
    return sum(1 for row in train if model.predict(row, k) != row[0])
def createClick(self):
    """Build the model for whichever algorithm radio button is checked.

    Runs only once all three directories (train/validation/predict) have
    been selected; disables the other algorithm options, instantiates the
    chosen algorithm, creates its model, and enables the Train button.
    """
    # Guard clause: all three directories must be chosen (explicit != ""
    # comparisons preserved from the original).
    if self.trainDir == "" or self.validationDir == "" or self.predictDir == "":
        return
    if self.simpleCNN.isChecked():
        self.knn.setEnabled(False)
        self.customCNN.setEnabled(False)
        self.chosenAlgorithm = CNN(self.trainDir, self.validationDir, self.predictDir,
                                   self.optimizer.currentText(), self.consolePrint)
    elif self.knn.isChecked():
        self.simpleCNN.setEnabled(False)
        self.customCNN.setEnabled(False)
        self.chosenAlgorithm = KNN(self.trainDir, self.validationDir)
    elif self.customCNN.isChecked():
        self.simpleCNN.setEnabled(False)
        self.knn.setEnabled(False)
        self.chosenAlgorithm = CustomCNN(self.trainDir, self.validationDir, self.predictDir,
                                         self.optimizer.currentText(), self.consolePrint)
    self.chosenAlgorithm.createModel()
    self.trainButton.setEnabled(True)
def handwritingClassTest():
    """Run the handwritten-digit KNN test and return the test error rate."""
    hwLabels = []
    zTrainingFiles = zipfile.ZipFile("datas/trainingDigits.zip")
    trainingFileList = zTrainingFiles.namelist()
    # Drop the leading directory entry of the archive listing.
    del (trainingFileList[0])
    print(trainingFileList)
    # trainingFileList=os.listdir("datas/trainingDigits")
    m = len(trainingFileList)  # number of training files
    trainingMat = ny.zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i].split('/')[1]  # bare file name
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])  # the digit label
        hwLabels.append(classNumStr)
        # NOTE(review): the listing comes from the zip archive but vectors
        # are read from the file system — presumably the archive is
        # extracted next to it; verify.
        trainingMat[i, :] = img2Vector("datas/trainingDigits/%s" % fileNameStr)
    testFileList = os.listdir("datas/testDigits")
    errorCount = 0
    mTest = len(testFileList)
    print(mTest)
    for j in range(mTest):
        fileNameStr = testFileList[j]
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split("_")[0])
        vectorUnderTest = img2Vector("datas/testDigits/%s" % fileNameStr)
        classiferResult = KNN.KNN(vectorUnderTest, trainingMat, hwLabels, 3)
        print("the classifier came back with: %d,the real answer is:%d " % (classiferResult, classNumStr))
        if classiferResult != classNumStr:
            errorCount += 1
            print("error text is:" + fileNameStr)
    return float(errorCount) / float(mTest)
def get_leave_one_out_error(train, k):
    """Leave-one-out error count: fit on all-but-one row and predict the held-out row.

    Each row's true label is assumed to be its first element.
    """
    num_errors = 0
    for idx, row in enumerate(train):
        held_in = np.delete(train, idx, axis=0)
        model = knn.KNN(held_in)
        if model.predict(row, k) != row[0]:
            num_errors += 1
    return num_errors
def test_fit(self):
    """predict() must reproduce the expected class for every stored test case."""
    cases = self.test_cases
    for ix, (train_imgs, train_labels) in enumerate(cases['input']):
        model = KNN(train_imgs, train_labels)
        preds = model.predict(cases['test_input'][ix][0], cases['rnd_K'][ix])
        np.testing.assert_array_equal(preds, cases['get_class'][ix])
def test_get_k_neighbours(self):
    """get_k_neighbours() must populate `neighbors` with the expected values."""
    cases = self.test_cases
    for ix, (train_imgs, train_labels) in enumerate(cases['input']):
        model = KNN(train_imgs, train_labels)
        model.get_k_neighbours(cases['test_input'][ix][0], cases['rnd_K'][ix])
        np.testing.assert_array_equal(model.neighbors, cases['get_k_neig'][ix])
def crossValidation(data, label, percent=0.95): ''' 使用训练数据集的一部分作交叉验证 默认以95%的数据作为训练数据,留5%的数据作交叉验证 两部分的数据没有交集,都是从原始训练数据中随机抽样得到 ''' # 获取事物总数 m = len(data) # 获取training set 的数量 numOfTrain = int(m * percent) # 生成所有事物的index indexRange = range(m) # 对index进行采样,作为我们的training set trainIndex = random.sample(indexRange, numOfTrain) # 剩下的作为cross validation set用于交叉验证 crossValidationIndex = set(indexRange).difference(set(trainIndex)) # 将其转换成列表 crossValidationIndex = list(crossValidationIndex) # 用于存放训练数据的feature trainingData = [] # 用于存放训练数据的label trainingLabel = [] for i in trainIndex: trainingData.append(data[i]) trainingLabel.append(label[i]) # 用于存放验证数据的feature crossValidationData = [] # 用于存放验证数据的label crossValidationLabel = [] for i in crossValidationIndex: crossValidationData.append(data[i]) crossValidationLabel.append(label[i]) # 创建一个字典,保存不同k情况下的分类错误率 errorRate = {} # 获取验证数据集的数量 crossValidationDataSize = len(crossValidationData) print '交叉验证训练集:' + str(m - crossValidationDataSize) print '交叉验证验证集:' + str(crossValidationDataSize) # 使用不同的k做crossValidation for k in K_RANGE: knn = KNN.KNN(np.array(trainingData), trainingLabel, k, False) # count为计数器,记录正确分类的事务数 count = 0.0 # 对验证数据进行分类 for i in range(crossValidationDataSize): result = knn.classify(np.array(crossValidationData[i])) # 如果分类正确,则count+1 if result == crossValidationLabel[i]: count += 1 # 计算分类错误率,并建起放入到字典中 errorRate[k] = 1 - float(count / crossValidationDataSize) print 'K=' + str(k) + '时,分类准确率为' + str(errorRate[k]) # 返回交叉验证结果 print errorRate
def run_test(trX, trY,res_file): desired_dt20 = 0.78 desired_dt50 = 0.78 desired_knn1 = 0.70 desired_knn3 = 0.73 print '\n\nFirst, we run DT and KNN on the training/development data to ' print 'ensure that we are getting roughly the right accuracies.' print 'We use the first 80% of the data as training, and the last' print '20% as test.' decTree = DT.DT() res = 1 print '\nDT (cutoff=20)...' sizeX = trX.shape end = int(np.round(sizeX[0]*0.80,decimals=0)) testRun = tt.TrainTest(decTree, trX[:end, :], trY[:end], trX[end:, :], trY[end:], 20) acc = testRun.run_tt() res += testRun.verifyAcc(acc['acc'], desired_dt20) print'\nTrainTime, TestTime', acc['trainTime'], acc['testTime'] res_file.write('\nDT (cutoff=20)') res_file.write('\nTrainTime, TestTime ' + str(acc['trainTime']) + ', ' + str(acc['testTime'])) print '\nDT (cutoff=50)...' testRun = tt.TrainTest(decTree, trX[:end, :], trY[:end], trX[end:sizeX[0], :], trY[end:sizeX[0]], 50) acc = testRun.run_tt() res += testRun.verifyAcc(acc['acc'], desired_dt50) print'\nTrainTime, TestTime', acc['trainTime'], acc['testTime'] res_file.write('\nDT (cutoff=50)') res_file.write('\nTrainTime, TestTime ' + str(acc['trainTime']) + ', ' + str(acc['testTime'])) knnModel = KNN.KNN() print '\nKNN (K=1)' max_size = sizeX[0] if sizeX[0] < 10001 else 10000 end = int(np.round(max_size*0.80,decimals=0)) testRun = tt.TrainTest(knnModel, trX[:end, :], trY[:end], trX[end:sizeX[0], :], trY[end:sizeX[0]], 1) acc = testRun.run_tt() res += testRun.verifyAcc(acc['acc'], desired_knn1) print'\nTrainTime, TestTime', acc['trainTime'], acc['testTime'] res_file.write('\nKNN (K=1)') res_file.write('\nTrainTime, TestTime ' + str(acc['trainTime']) + ', ' + str(acc['testTime'])) print '\nKNN (K=3)' testRun = tt.TrainTest(knnModel, trX[:end, :], trY[:end], trX[end:sizeX[0], :], trY[end:sizeX[0]], 3) acc = testRun.run_tt() res += testRun.verifyAcc(acc['acc'], desired_knn3) print'\nTrainTime, TestTime', acc['trainTime'], acc['testTime'] res_file.write('\nKNN 
(K=3)') res_file.write('\nTrainTime, TestTime ' + str(acc['trainTime']) + ', ' + str(acc['testTime'])) raw_input('\nPress enter to continue...') return
def _init_classifiers(self):
    """Build every classifier object and train the ones without a saved model."""
    # Instantiate classifier objects.
    self.fenc = FreemanEncoder()
    self.knn = KNN.KNN()
    self.HMM = HMM.HMM()
    self.NaiveBayes = NaiveBayes.NaiveBayes()
    self.RandomForest = RandomForest.RandomForests()
    self.SVM = svm.SVM_SVC()
    self.LogisticReg = LogisticReg.LogisticReg()
    self.AdaBoost = adaboost.AdaBoost()
    self.GBRT = gbrt.GBRT()
    dataset = CharRecognitionGUI_support.training_dataset
    # KNN and HMM have no pickled models: always train them up front.
    self.knn.knn_train(dataset, 1.0)
    self.HMM.training(dataset)
    # For the rest, train on the default data set only when no pickled
    # model can be loaded (same order as the original try/except chain).
    pickled_models = [
        ("./Models/naivebayes_model.p", self.NaiveBayes),
        ("./Models/random_forest.p", self.RandomForest),
        ("./Models/svm.p", self.SVM),
        ("./Models/logistic_model.p", self.LogisticReg),
        ("./Models/AdaBoostClassifier.p", self.AdaBoost),
        ("./Models/GradientBoostingClassifier.p", self.GBRT),
    ]
    for path, model in pickled_models:
        try:
            pickle.load( open( path, "rb" ) )
        except IOError:
            model.training(dataset)
def Process_air_quality():
    """Run the full model suite on the air-pollution (ozone) data set."""
    X, y = LoadData.load_ozone_data()
    Experiments.Models_Comparison(X, y, "Air Pollution")
    optimize = True
    DT.DecisionTree(X, y, title="Air Pollution Decision Tree", optimize=optimize)
    AB.AdaBoost(X, y, title="Air Pollution AdaBoost", optimize=optimize)
    KNN.KNN(X, y, title="Air Pollution KNN", optimize=optimize)
    NN.NeuralNetwork(X, y, title="Air Pollution Neural Network", optimize=optimize)
    SVM.SVM(X, y, title="Air Pollution SVM", optimize=optimize)
def retrieval_knn_shape(train_imgs, train_class_labels, K_max):
    """Visualize KNN shape retrieval for every K in [2, K_max).

    Uses the module-level test_imgs/test_class_labels; shows the first 30
    predictions per K.
    """
    knn = KNN.KNN(train_imgs, train_class_labels)
    for k in range(2, K_max):
        labels = knn.predict(test_imgs, k)
        asserted_label_percentaje, equals = get_shape_accuracy(labels, test_class_labels)
        visualize_retrieval(test_imgs, 30, labels[:30], equals[:30],
                            title="KNN shape with K: " + str(k))
def datingClassTest():
    """Hold-out evaluation of the dating-data KNN classifier.

    Returns the error rate on the first 10% of (normalized) samples,
    classified against the remaining 90%.
    """
    hoRatio = 0.10  # fraction of the samples held out for testing
    k = 7
    datingDataMat, datingLabels = getData.file2Matrix(
        '../datas/datingTestSet2.txt')
    normMat = normalization.normalize(datingDataMat)
    m = normMat.shape[0]  # total number of samples
    numTestVecs = int(m * hoRatio)  # number of test samples
    errorCount = 0  # misclassified samples
    for i in range(numTestVecs):
        classifierResult = KNN.KNN(normMat[i, :], normMat[numTestVecs:m, :],
                                   datingLabels[numTestVecs:m], k)
        if classifierResult != datingLabels[i]:
            errorCount += 1
    return float(errorCount) / float(numTestVecs)
def topNRecs():
    """Read the GUI entries, run KNN, and write the neighbours to output.txt.

    BUG FIX: the original unconditionally iterated ``list_of_Neighbors``
    even when the KNN driver returned None, raising a TypeError on bad CSV
    input instead of showing the warning label.  The output file is now
    also opened via ``with`` so the handle is always closed.
    """
    # Get entries from the UI.
    csv_file_name = entry.get()
    string = entry2.get()
    # User selections: first char is k, third is the query point index.
    numNeighbors = int(string[0])
    selectedPoint = int(string[2])
    # Call KNN class and get the neighbors.
    knnClient = KNN(csv_file_name, selectedPoint, numNeighbors)
    list_of_Neighbors = knnClient.driver()
    # Catch bad input from a bad csv — skip the dump in that case.
    if list_of_Neighbors is None:
        out = "Warning: No Neighbors. Edit the CSV or try again."
    else:
        out = "Results are in the txt file"
        # Dump the results to a file.
        with open('output.txt', 'w') as f:
            for neighbor in list_of_Neighbors:
                f.write("%s\n" % str(neighbor))
    # Transform the csv into a list of row dicts.
    reader = csv.DictReader(open(csv_file_name))
    dict_list = []
    for line in reader:
        dict_list.append(line)
    # Store the csv in a txt file with appropriate headers for better analysis.
    with open('dict.txt', 'w') as d:
        for dictonary in dict_list:
            d.write("%s\n" % str(dictonary))
    # Output windows.
    title = tk.Label(root, text=out, font=('helvetica', 10))
    canvas.create_window(220, 80, window=title)
    # More output windows suggesting the user try another csv file.
    ans = tk.Label(root, text="Try another txt file below!", font=('helvetica', 15))
    canvas.create_window(250, 300, window=ans)
def Process_wine_quality():
    """Run the full model suite on the wine-quality data set and time it."""
    t0 = time.time()
    X, y = LoadData.load_wine_quality_data()
    optimize = True
    Experiments.Models_Comparison(X, y, "Wine Quality")
    DT.DecisionTree(X, y, title="Wine Quality Decision Tree", optimize=optimize)
    AB.AdaBoost(X, y, title="Wine Quality AdaBoost", optimize=optimize)
    KNN.KNN(X, y, title="Wine Quality KNN", optimize=optimize)
    NN.NeuralNetwork(X, y, title="Wine Quality Neural Network", optimize=optimize)
    SVM.SVM(X, y, title="Wine Quality SVM", optimize=optimize)
    t1 = time.time()
    print("total TIME")
    print(t1 - t0)
def Test_Retrieval_by_shape(test_images, neigh, forma, cantidad):
    """Run Retrieval_by_shape on KNN predictions and visualize the result.

    Args:
        test_images: images to classify.
        neigh: K used by the KNN predictor.
        forma: shape label queried.
        cantidad: number of retrieved images to display.
    """
    knn = KNN(train_imgs, train_class_labels)
    result_knn = knn.predict(test_images, neigh)
    index_image = Retrieval_by_shape(result_knn, forma, cantidad)
    # Flag which retrieved images match the ground-truth label.
    check = [result_knn[i] == test_class_labels[i] for i in index_image]
    GroundTruth = test_class_labels[index_image]
    titol = "Retrieval_by_shape Query:" + str(forma)
    visualize_retrieval(test_images[index_image], cantidad,
                        info=GroundTruth, ok=check, title=titol)
def identify(file):
    """Recognize the characters in an image file with the saved KNN model.

    Denoises the image, cuts it into per-letter slices, reduces each slice
    to 3 PCA components, and concatenates the per-letter predictions.
    """
    x_list = np.load('C:/Users/Jaqen/Desktop/matrix_list.npy')
    Y = np.load('C:/Users/Jaqen/Desktop/class_list.npy')
    knn = KNN.KNN()
    knn.fit(x_list, Y, 10)
    img = Image.open(denoise.process(file))
    letters = cut.cut(img)
    pre_letters = ''
    for left, right in letters:
        im = img.crop((left, 0, right, img.size[1]))
        X = norm.normalize(im)
        # Dimensionality reduction to 3 components.
        pca = norm.PCA(X, n_components=3)
        X_ = pca.reduce_dim()
        pre_letters += knn.predict(X_)
    return pre_letters
def main():
    """Compare KNN variants on the Haberman data set and plot the metrics.

    Prints the confusion-matrix counts for each variant, shows the class
    distribution, then bar-plots Recall / F-score / G-mean per algorithm.
    """
    # BUG FIX: `np.float` was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `float` is the documented replacement.
    data_set = np.loadtxt('haberman.csv', dtype=float, delimiter=',', skiprows=1)
    best_k = int(KNN.get_KNNbestK(data_set))
    recallknn, fscoreknn, gmeanknn, TP, FN, TN, FP = KNN.KNN(data_set, best_k)
    recallwknn, fscorewknn, gmeanwknn, TP1, FN1, TN1, FP1 = WKNN.WKNN(data_set, best_k)
    recallpknn, fscorepknn, gmeanpknn, TP2, FN2, TN2, FP2 = PTM_KNN.PTM_KNN(data_set)
    recallpwknn, fscorepwknn, gmeanpwknn, TP3, FN3, TN3, FP3 = PTM_WKNN.PTM_WKNN(data_set)
    recallnpwknn, fscorenpwknn, gmeannpwknn, TP4, FN4, TN4, FP4 = NEW_PTM_WKNN.NEW_PTM_WKNN(data_set)
    print(TP, FN, TN, FP)
    print(TP1, FN1, TN1, FP1)
    print(TP2, FN2, TN2, FP2)
    print(TP3, FN3, TN3, FP3)
    print(TP4, FN4, TN4, FP4)
    # Bar chart of the positive/negative class distribution.
    class1, class1_number = get_positive_class(data_set)
    rects = plt.barh(class1, class1_number)
    plt.title('正反类分布')
    for rect in rects:
        width = rect.get_width()
        plt.text(width, rect.get_y() + rect.get_height() / 2, str(width),
                 ha='center', va='bottom')
    plt.show()
    # Metrics as percentages truncated to two decimals.
    buy_number = [int(recallknn * 10000) / 100, int(recallwknn * 10000) / 100,
                  int(recallpknn * 10000) / 100, int(recallpwknn * 10000) / 100,
                  int(recallnpwknn * 10000) / 100]
    buy_number2 = [int(fscoreknn * 10000) / 100, int(fscorewknn * 10000) / 100,
                   int(fscorepknn * 10000) / 100, int(fscorepwknn * 10000) / 100,
                   int(fscorenpwknn * 10000) / 100]
    buy_number3 = [int(gmeanknn * 10000) / 100, int(gmeanwknn * 10000) / 100,
                   int(gmeanpknn * 10000) / 100, int(gmeanpwknn * 10000) / 100,
                   int(gmeannpwknn * 10000) / 100]
    name = ['KNN', 'WKNN', 'PTM-KNN', 'PTM-WKNN', 'NEW_PTM-WKNN']
    total_width, n = 2, 3
    width = total_width / n
    x = [0, 2.5, 5, 7.5, 10]
    # Three grouped bar series, shifting x by one bar width each time.
    a = plt.bar(x, buy_number, width=width, label='Recall', fc='y')
    for i in range(len(x)):
        x[i] = x[i] + width
    b = plt.bar(x, buy_number2, width=width, label='F-score', tick_label=name, fc='r')
    for i in range(len(x)):
        x[i] = x[i] + width
    c = plt.bar(x, buy_number3, width=width, label='G-mean', fc='b')
    autolabel(a)
    autolabel(b)
    autolabel(c)
    # plt.xlabel('算法')
    plt.ylabel('百分比(%)')
    plt.title('实验结果')
    plt.legend()
    plt.show()
def runPipeline(self, seeDiff=True, k=10):
    """Run the imputation pipeline over the suffixed (``_<digit>``) columns.

    Applies each (pattern, imputation) pair from the module-level
    ``pipeline``, optionally finishes with KNN imputation, and returns the
    un-aggregated frame (plus the per-step distributions when ``seeDiff``).

    BUG FIX: the regex literals were written as ``'_\\d'`` without the raw
    prefix; ``\\d`` is an invalid escape sequence in ordinary string
    literals (DeprecationWarning today, SyntaxError in future Python).
    """
    filtered = self.data.filter(regex=r'_\d').copy()
    filtered.fillna('NA', inplace=True)
    agg_filtered = makeAggregate(filtered)
    print("Running pipeline length", len(pipeline))
    #print("Start distribution", getDistribution(agg_filtered))
    distributions = []
    for number, imputation in enumerate(pipeline):
        print("Imputation number", number + 1)
        print("Pattern: ", imputation[0], "Imputation: ", imputation[1])
        patternMatcher = PatternMatcher(agg_filtered, imputation[0])
        indexes = patternMatcher.returnIndex()
        print(len(indexes), "of pattern found")
        agg_filtered = patternMatcher.imputePattern(imputation[1])
        distributions.append(getDistribution(agg_filtered))
    retVal = unAggregate(agg_filtered, self.data)
    retVal = retVal.replace({'NA': np.nan})
    if (self.run_knn):
        for_knn = retVal.filter(regex=r'_\d')
        knn_imputed = KNN(k=k).complete(for_knn)
        knn_imputed = pd.DataFrame(knn_imputed)
        knn_imputed.columns = for_knn.columns
        knn_imputed = knn_imputed.applymap(bar)
        # Carry over the columns that were excluded from imputation.
        diff = set(self.data.columns).difference(
            set(self.data.filter(regex=r'_\d').columns))
        for col in diff:
            knn_imputed[col] = self.data[col]
        retVal = knn_imputed
    if (seeDiff):
        return retVal, distributions
    else:
        return retVal
def performKNN(inputDataClass, nearestNeighbours, mode, label_with_distance=False):
    """Run KNN on the splits held by inputDataClass and print test accuracy.

    The last column of Train/Test is treated as the label column.
    """
    train = inputDataClass.Train
    test = inputDataClass.Test
    # A full covariance matrix is only computed for mode 3 — presumably a
    # Mahalanobis-style distance; confirm against the KNN implementation.
    covar = -1
    if mode == 3:
        covar = performanceAnalyser.getFullCovariance(train[:, :-1])
    knn = KNN.KNN(nearestNeighbours, train[:, :-1], test[:, :-1], train[:, -1],
                  label_with_distance=label_with_distance, mode=mode, covar=covar)
    knn.allocate()
    Ypred = knn.labels
    Ytrue = test[:, -1]
    print("Testing Accuracy = " + str(performanceAnalyser.calcAccuracyTotal(Ypred, Ytrue)))
def Main():
    """Evaluate the KNN model on a hold-out split of the training set.

    The first 10% of the (normalized) samples are classified against the
    remaining 90%; several metrics are printed.
    """
    horatio = 0.1
    trainset, trainlabel = ud.get_file_matrix('trainset.txt')
    trainset = np.array(ud.normalized(trainset))
    fiter = int(len(trainlabel) * horatio)
    out = model.KNN(trainset[:fiter], trainset[fiter:], trainlabel[fiter:], k=3)
    acc, precision, recall, F1 = ud.get_Precision_Recall(out, trainlabel[:fiter])
    hl = hamming_loss(trainlabel[:fiter], out)
    print("hamming_loss:{}".format(hl))
    print("Acc:{}".format(acc))
    print("Precision:{}".format(precision))
    print("Reacall:{}".format(recall))
    print("F1:{}".format(F1))
    target_names = ['class 0', 'class 1', 'class 2']
    print(classification_report(trainlabel[:fiter], out))
def get_knn_accuracy_(train_imgs, train_class_labels, K_max):
    """Score KNN shape accuracy for several K values and distance metrics.

    For each distance heuristic, saves a per-K scatter plot of expected vs
    predicted labels and a final accuracy-vs-K plot into output_folder.
    """
    plt.clf()
    knn = KNN.KNN(train_imgs, train_class_labels)
    distances_to_use = ["euclidean", "cityblock"]
    print("starting knn with next heuristics for distance: ", distances_to_use)
    print("estimated time 6 mins")
    for distance in distances_to_use:
        time.sleep(3)
        percentages_returned = []
        start = time.time()
        for k in range(2, K_max):
            labels = knn.predict(test_imgs, k, distance)
            asserted_label_percentaje, equals = get_shape_accuracy(labels, test_class_labels)
            plt.clf()
            plt.scatter(test_class_labels.tolist(), labels.tolist())
            plt.title("expected shape VS actual shape with K = " + str(k))
            plt.xlabel("test class labels")
            plt.ylabel("asserted labels")
            plt.grid()
            plt.savefig(output_folder + "formas encontradasK=" + str(k) + distance + ".png")
            percentages_returned.append(asserted_label_percentaje)
        print(distance + " finished in: ", time.time() - start)
        # Accuracy-vs-K summary plot for this distance.
        plt.clf()
        plt.scatter(list(range(2, K_max)), percentages_returned)
        plt.title("KNN % " + distance + " success")
        plt.xlabel("K")
        plt.ylabel("%")
        plt.savefig(output_folder + "porcentaje " + distance + ".png")
def choix_classifieurs(self, X_train, y_train, X_test, y_test):
    """Search the best classifier configuration for each method family.

    Returns a tuple (NaiveBayes, DecisionTree, KNN, Perceptron, SVM) of
    the selected classifiers; KNN has no hyper-parameter search.
    """
    print(
        " \n\t\t--- Recherche des meilleurs classifieurs pour chaque méthode ---\n\n"
    )
    # Naive Bayes.
    print(" --- Recherche pour Naive Bayes ---\n")
    nB = nb.NaiveBayes()
    clfNB = nB.choixNB(X_train, y_train, X_test, y_test)
    # Decision tree.
    print(" --- Recherche pour Arbre de Decision ---\n")
    tree = dt.DecisionTree()
    clfTree, _ = tree.recherche_param(X_train, y_train, X_test, y_test)
    # K nearest neighbours: no parameter search.
    print(
        "\n --- Pas de recherche de paramètres pour les K plus proches voisins ---\n"
    )
    kNN = knn.KNN()
    # SVM.
    print(" --- Recherche pour la SVM ---\n")
    sVM = svm.SVM()
    clfSVM = sVM.hyperParameter(X_train, y_train)
    # Perceptron.
    print(" --- Recherche pour le Perceptron ---\n")
    perceptron = perceptr.Perceptr()
    clfPerceptr = perceptron.rechercheHypParm(X_train, y_train, X_test, y_test)
    return (clfNB, clfTree, kNN, clfPerceptr, clfSVM)
print(type(buying[0])) raw_x = list(zip(buying, maint, door, persons, lug_boot, safety)) raw_y = list(cls) raw_x = np.array(raw_x, dtype = np.float64) raw_y = np.array(raw_y) x_train, x_test, y_train, y_test = train_test_split(raw_x, raw_y, random_state= 1, test_size = 0.2) # apply KNN accuracy = {'method' : 'mahattan_distance'} for i in range(1,101): model = KNN(i) model.fit(x_train, y_train, accuracy['method']) predictions = model.predict(x_test) #print("Model evaluation ") acc, con_matrix = model.evaluate(y_test, predictions) #print("Accuracy: {}".format(acc)) #print("Confusion matrix") #print(con_matrix) accuracy.update({i : acc}) import json with open("norm1.json", 'w') as file:
''' import numpy as np import DT as dt import KNN as knn if __name__ == '__main__': print 'running tests on DT and KNN' #This is the class example [mathy, test >= 80, project >= 80, early] #with a slight change so that non-mathy first splits on early. trX=np.array([[1,1,1,1],[1,1,1,0],[0,1,0,1],[0,0,1,1],[0,0,1,1],[0,0,0,0],[0,0,0,0],[1,0,1,1],[1,0,0,1],[0,0,1,1],[1,0,0,0],[0,0,1,1],[0,1,0,1],[0,0,1,0]]) trY=np.array([[1],[1],[0],[0],[0],[1],[0],[1],[0],[0],[0],[0],[0],[1]]) deX = np.array([[0,1,0,0],[0,0,1,0],[0,1,1,1]]) deY = np.array([[0],[1],[0]]) decTree = dt.DT() print 'DT, cutoff=0' trainModel = decTree.res('train',X=trX,Y=trY,h_param=0) decTree.DTdraw(trainModel) output = decTree.res('predict',model=trainModel,test_case=deX) print output knnMode = knn.KNN() print 'KNN, k=1' trainModel = knnMode.res('train',X=trX,Y=trY,h_param=1) output = knnMode.res('predict',model=trainModel,test_case=deX) print output print 'Done'
if dec not in results.keys(): print '\nNow we vary the cutoff for the decision tree and see how it affects accuracy...' thresh = [5,10,20,40,80,160] decTree = DT.DT() res = run_comps(decTree, thresh, trX[0:4800, :], trY[0:4800], trX[4801:6000, :], trY[4801:6000],"Figure 2: DT cutoff versus accuracy (MNIST)","DT cutoff","../figure2.png") results[dec] = res res_file.write('\n' + dec + '\n') res_file.write(str(res)) raw_input('Press enter to continue...') neigh = 'knn'+data_types[i] if neigh not in results.keys(): print '\nNow we vary the k for the KNN classifier and see how it affects accuracy...' allK = [1,8,16,32,64,128] knnModel = KNN.KNN() res = run_comps(knnModel, allK, trX[0:2000, :], trY[0:2000], trX[2001:2501, :], trY[2001:2501],"Figure 3: KNN count versus accuracy (MNIST)","KNN count","../figure3.png") results[neigh] = res res_file.write('\n' + neigh + '\n') res_file.write(str(res)) raw_input('Press enter to continue...') heldDT = 'hoDT_'+data_types[i] if heldDT not in results.keys(): print '\nNow we make predictions on dev and test data using the best DT' thresh = [5,10,20,40,80,160] dtres = 'dt' + data_types[i] dtAccs = results[dtres] bestDT = np.argmax(dtAccs) decTree = DT.DT()