Code example #1
    def __init__(self, symb, predlen, cat='RF', kwargs=None):
        # cat selects which learner to wrap; it must be one of
        # 'RF', 'KNN', 'SVM', 'NN', otherwise self.learner is never set.
        self.symb = symb
        self.predlen = predlen
        self.kwargs = kwargs
        self.cat = cat

        init_kwargs = kwargs if kwargs is not None else {}
        if cat == 'RF':
            self.learner = RF.RandomForest(**init_kwargs)
        elif cat == 'KNN':
            self.learner = KNN.KNN(**init_kwargs)
        elif cat == 'SVM':
            self.learner = SVM.SVM(**init_kwargs)
        elif cat == 'NN':
            self.learner = NN.NN(**init_kwargs)
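
For context, a minimal usage sketch; the class name Predictor, the ticker string, and the kwargs values are hypothetical, not taken from the original project:

# Hypothetical usage; Predictor stands for whichever class defines
# the __init__ above.
p = Predictor('AAPL', predlen=5, cat='KNN', kwargs={'k': 3})
print(p.learner)  # the wrapped KNN.KNN instance, built with k=3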
Code example #2
def TestShapeAccuracy(train_images, train_labels, test_images, test_labels,
                      neigh, percentageTrain):
    limitTrain = int(train_labels.shape[0] * percentageTrain / 100)
    knn = KNN(train_images[:limitTrain], train_labels[:limitTrain])
    preds = knn.predict(test_images, neigh)
    percentage = Get_shape_accuracy(preds, test_labels)
    return percentage
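
Get_shape_accuracy is not shown on this page; a minimal sketch consistent with how this snippet calls it (returning the percentage of predictions that match the ground truth), assuming numpy inputs:

import numpy as np

def Get_shape_accuracy(preds, ground_truth):
    # Percentage of predicted labels equal to the ground-truth labels.
    return 100.0 * np.mean(np.asarray(preds) == np.asarray(ground_truth))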
Code example #3
    def handwritingClassTest(self):
        hwLabels = []
        # Load the training set
        trainingFileList = listdir(Config.DATAS + 'KNN/digits/trainingDigits')
        m = len(trainingFileList)
        trainingMat = zeros((m, 1024))
        for i in range(m):
            fileNameStr = trainingFileList[i]
            fileStr = fileNameStr.split('.')[0]  # strip the .txt extension
            classNumStr = int(fileStr.split('_')[0])
            hwLabels.append(classNumStr)
            trainingMat[i, :] = self.img2vector(
                Config.DATAS + 'KNN/digits/trainingDigits/%s' % fileNameStr)
        # Iterate over the test set
        testFileList = listdir(Config.DATAS + 'KNN/digits/testDigits')
        errorCount = 0.0
        mTest = len(testFileList)
        for i in range(mTest):
            fileNameStr = testFileList[i]
            fileStr = fileNameStr.split('.')[0]  # strip the .txt extension
            classNumStr = int(fileStr.split('_')[0])
            vectorUnderTest = self.img2vector(Config.DATAS +
                                              'KNN/digits/testDigits/%s' %
                                              fileNameStr)
            classifierResult = KNN.KNN().classify(vectorUnderTest, trainingMat,
                                                  hwLabels, 3)
            print("the classifier came back with: %d, the real answer is: %d" %
                  (classifierResult, classNumStr))
            if classifierResult != classNumStr:
                errorCount += 1.0
        print("\nthe total number of errors is: %d" % errorCount)
        print("\nthe total error rate is: %f" % (errorCount / float(mTest)))
Code example #4
def main(k=10, p=2):
	# Load the data: 5000 grayscale 32x32 digit images, 10 classes
	data = np.zeros((5000, 32 * 32))
	labels = np.zeros((5000, 1))
	for i in range(500):
		for j in range(10):
			labels[i*10+j, :] = j
			pic = Image.open('./dataset/%d%d.png' % (i, j))
			width = pic.size[0]
			height = pic.size[1]
			for x in range(width):
				for y in range(height):
					data[i*10+j, x*width+y] = pic.getpixel((x, y))
	data_train = data[:4500, :]
	labels_train = labels[:4500, :]
	data_test = data[4500:, :]
	labels_test = labels[4500:, :]
	# Build the k-NN model
	model = KNN.KNN(k, p)
	# Predict on the 500 held-out images
	n_correct = 0
	n_total = 500
	for i in range(500):
		pred = model.classify(data_test[i, :], data_train, labels_train)
		print('[True Label]: %d, [Predict Label]: %d' % (int(labels_test[i][0]), pred))
		if int(pred) == int(labels_test[i][0]):
			n_correct += 1
	acc = (n_correct / n_total) * 100
	print('[Test Accuracy]: %.2f' % acc)
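
The per-pixel loop above can be collapsed into a single numpy conversion; a sketch, assuming single-channel 32x32 PNGs (for square images the transpose reproduces the loop's x*width+y ordering exactly):

import numpy as np
from PIL import Image

pic = Image.open('./dataset/00.png')            # hypothetical first image
row = np.asarray(pic, dtype=float).T.flatten()  # same layout as the nested loop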
Code example #5
def get_test_error(train, test, k):
    # Count misclassified test rows; each row stores its label in column 0.
    myKNN = knn.KNN(train)
    num_errors = 0
    for i in range(len(test)):
        if myKNN.predict(test[i], k) != test[i][0]:
            num_errors += 1
    return num_errors
Code example #6
def get_training_error(train, k):
    # Same as get_test_error, but evaluated on the training rows themselves.
    myKNN = knn.KNN(train)
    num_errors = 0
    for i in range(len(train)):
        if myKNN.predict(train[i], k) != train[i][0]:
            num_errors += 1
    return num_errors
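
A sketch of a model-selection loop built on the two helpers above; the odd-k candidate range is an assumption, not part of the original code:

# Hypothetical: compare training and test error across odd k values.
for k in range(1, 16, 2):
    print(k, get_training_error(train, k), get_test_error(train, test, k))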
Code example #7
    def createClick(self):
        if self.trainDir != "" and self.validationDir != "" and self.predictDir != "":
            if self.simpleCNN.isChecked():
                self.knn.setEnabled(False)
                self.customCNN.setEnabled(False)

                self.chosenAlgorithm = CNN(self.trainDir, self.validationDir,
                                           self.predictDir,
                                           self.optimizer.currentText(),
                                           self.consolePrint)

            elif self.knn.isChecked():
                self.simpleCNN.setEnabled(False)
                self.customCNN.setEnabled(False)

                self.chosenAlgorithm = KNN(self.trainDir, self.validationDir)

            elif self.customCNN.isChecked():
                self.simpleCNN.setEnabled(False)
                self.knn.setEnabled(False)

                self.chosenAlgorithm = CustomCNN(self.trainDir,
                                                 self.validationDir,
                                                 self.predictDir,
                                                 self.optimizer.currentText(),
                                                 self.consolePrint)

            self.chosenAlgorithm.createModel()
            self.trainButton.setEnabled(True)
Code example #8
def handwritingClassTest():
    hwLabels = []
    zTrainingFiles = zipfile.ZipFile("datas/trainingDigits.zip")
    trainingFileList = zTrainingFiles.namelist()
    del trainingFileList[0]  # drop the archive's directory entry
    #trainingFileList=os.listdir("datas/trainingDigits")
    m = len(trainingFileList)  # number of training files
    trainingMat = ny.zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i].split('/')[1]  # file name
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])  # the digit label
        hwLabels.append(classNumStr)
        trainingMat[i, :] = img2Vector("datas/trainingDigits/%s" % fileNameStr)
    testFileList = os.listdir("datas/testDigits")
    errorCount = 0
    mTest = len(testFileList)
    for j in range(mTest):
        fileNameStr = testFileList[j]
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split("_")[0])
        vectorUnderTest = img2Vector("datas/testDigits/%s" % fileNameStr)
        classiferResult = KNN.KNN(vectorUnderTest, trainingMat, hwLabels, 3)
        print("the classifier came back with: %d, the real answer is: %d" %
              (classiferResult, classNumStr))
        if classiferResult != classNumStr:
            errorCount += 1
            print("error file is: " + fileNameStr)
    return float(errorCount) / float(mTest)
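
Examples #8 and #17 call KNN.KNN(inX, dataSet, labels, k) as a plain function. That implementation is not shown on this page; a minimal distance-vote sketch consistent with the call signature:

import numpy as np

def KNN(inX, dataSet, labels, k):
    # Euclidean distance from inX to every training row.
    dists = np.sqrt(((dataSet - inX) ** 2).sum(axis=1))
    # Majority vote among the k nearest neighbours.
    votes = {}
    for idx in dists.argsort()[:k]:
        votes[labels[idx]] = votes.get(labels[idx], 0) + 1
    return max(votes, key=votes.get)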
Code example #9
def get_leave_one_out_error(train, k):
    # Leave-one-out: refit on all rows but i, then test on row i.
    num_errors = 0
    for i in range(len(train)):
        train_minus = np.delete(train, i, axis=0)
        myKNN = knn.KNN(train_minus)
        if myKNN.predict(train[i], k) != train[i][0]:
            num_errors += 1
    return num_errors
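
And a sketch of using the leave-one-out count to pick k, again with an assumed candidate range:

# Hypothetical: choose the k with the fewest leave-one-out errors.
best_k = min(range(1, 16, 2), key=lambda k: get_leave_one_out_error(train, k))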
Code example #10
    def test_fit(self):
        for ix, (train_imgs,
                 train_labels) in enumerate(self.test_cases['input']):
            knn = KNN(train_imgs, train_labels)
            preds = knn.predict(self.test_cases['test_input'][ix][0],
                                self.test_cases['rnd_K'][ix])
            np.testing.assert_array_equal(preds,
                                          self.test_cases['get_class'][ix])
Code example #11
    def test_get_k_neighbours(self):
        for ix, (train_imgs,
                 train_labels) in enumerate(self.test_cases['input']):
            knn = KNN(train_imgs, train_labels)
            knn.get_k_neighbours(self.test_cases['test_input'][ix][0],
                                 self.test_cases['rnd_K'][ix])
            np.testing.assert_array_equal(knn.neighbors,
                                          self.test_cases['get_k_neig'][ix])
Code example #12
File: main.py Project: MaWeiWri/Kaggle
def crossValidation(data, label, percent=0.95):
    '''
    Cross-validate on a portion of the training set.
    By default 95% of the data is used for training and 5% is held out
    for validation; the two parts are disjoint and both are sampled at
    random from the original training data.
    '''
    # Total number of records
    m = len(data)
    # Size of the training set
    numOfTrain = int(m * percent)
    # Indices of all records
    indexRange = range(m)

    # Sample indices for the training set
    trainIndex = random.sample(indexRange, numOfTrain)
    # The remaining indices form the cross-validation set
    crossValidationIndex = list(set(indexRange).difference(set(trainIndex)))

    # Features and labels of the training data
    trainingData = []
    trainingLabel = []
    for i in trainIndex:
        trainingData.append(data[i])
        trainingLabel.append(label[i])

    # Features and labels of the validation data
    crossValidationData = []
    crossValidationLabel = []
    for i in crossValidationIndex:
        crossValidationData.append(data[i])
        crossValidationLabel.append(label[i])

    # Map each k to its classification error rate
    errorRate = {}
    crossValidationDataSize = len(crossValidationData)
    print('cross-validation training set: ' + str(m - crossValidationDataSize))
    print('cross-validation validation set: ' + str(crossValidationDataSize))
    # Run cross-validation for each k
    for k in K_RANGE:
        knn = KNN.KNN(np.array(trainingData), trainingLabel, k, False)
        # count tracks the number of correctly classified records
        count = 0.0
        for i in range(crossValidationDataSize):
            result = knn.classify(np.array(crossValidationData[i]))
            if result == crossValidationLabel[i]:
                count += 1
        # Store the error rate for this k
        errorRate[k] = 1 - float(count / crossValidationDataSize)
        print('K=' + str(k) + ': error rate ' + str(errorRate[k]))
    # Report the cross-validation results
    print(errorRate)
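
If crossValidation returned errorRate instead of printing it, picking the best k would be a one-liner; a sketch under that assumption:

errorRate = crossValidation(data, label)  # assumes a final `return errorRate`
best_k = min(errorRate, key=errorRate.get)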
Code example #13
def run_test(trX, trY, res_file):
    desired_dt20 = 0.78
    desired_dt50 = 0.78
    desired_knn1 = 0.70
    desired_knn3 = 0.73

    print('\n\nFirst, we run DT and KNN on the training/development data to')
    print('ensure that we are getting roughly the right accuracies.')
    print('We use the first 80% of the data as training, and the last')
    print('20% as test.')

    decTree = DT.DT()
    res = 1

    print('\nDT (cutoff=20)...')
    sizeX = trX.shape
    end = int(np.round(sizeX[0] * 0.80, decimals=0))
    testRun = tt.TrainTest(decTree, trX[:end, :], trY[:end], trX[end:, :], trY[end:], 20)
    acc = testRun.run_tt()
    res += testRun.verifyAcc(acc['acc'], desired_dt20)
    print('\nTrainTime, TestTime', acc['trainTime'], acc['testTime'])
    res_file.write('\nDT (cutoff=20)')
    res_file.write('\nTrainTime, TestTime ' + str(acc['trainTime']) + ', ' + str(acc['testTime']))

    print('\nDT (cutoff=50)...')
    testRun = tt.TrainTest(decTree, trX[:end, :], trY[:end], trX[end:sizeX[0], :], trY[end:sizeX[0]], 50)
    acc = testRun.run_tt()
    res += testRun.verifyAcc(acc['acc'], desired_dt50)
    print('\nTrainTime, TestTime', acc['trainTime'], acc['testTime'])
    res_file.write('\nDT (cutoff=50)')
    res_file.write('\nTrainTime, TestTime ' + str(acc['trainTime']) + ', ' + str(acc['testTime']))

    knnModel = KNN.KNN()
    print('\nKNN (K=1)')
    # Cap KNN at 10,000 examples to keep prediction time manageable
    max_size = sizeX[0] if sizeX[0] < 10001 else 10000
    end = int(np.round(max_size * 0.80, decimals=0))
    testRun = tt.TrainTest(knnModel, trX[:end, :], trY[:end], trX[end:sizeX[0], :], trY[end:sizeX[0]], 1)
    acc = testRun.run_tt()
    res += testRun.verifyAcc(acc['acc'], desired_knn1)
    print('\nTrainTime, TestTime', acc['trainTime'], acc['testTime'])
    res_file.write('\nKNN (K=1)')
    res_file.write('\nTrainTime, TestTime ' + str(acc['trainTime']) + ', ' + str(acc['testTime']))

    print('\nKNN (K=3)')
    testRun = tt.TrainTest(knnModel, trX[:end, :], trY[:end], trX[end:sizeX[0], :], trY[end:sizeX[0]], 3)
    acc = testRun.run_tt()
    res += testRun.verifyAcc(acc['acc'], desired_knn3)
    print('\nTrainTime, TestTime', acc['trainTime'], acc['testTime'])
    res_file.write('\nKNN (K=3)')
    res_file.write('\nTrainTime, TestTime ' + str(acc['trainTime']) + ', ' + str(acc['testTime']))

    input('\nPress enter to continue...')

    return
Code example #14
    def _init_classifiers(self):
        # Initialize classifier objects
        self.fenc = FreemanEncoder()
        self.knn = KNN.KNN()
        self.HMM = HMM.HMM()
        self.NaiveBayes = NaiveBayes.NaiveBayes()
        self.RandomForest = RandomForest.RandomForests()
        self.SVM = svm.SVM_SVC()
        self.LogisticReg = LogisticReg.LogisticReg()
        self.AdaBoost = adaboost.AdaBoost()
        self.GBRT = gbrt.GBRT()

        # Train initially on the default data set, if no model saved already

        # Initialize KNN, no saved model for KNN
        self.knn.knn_train(CharRecognitionGUI_support.training_dataset, 1.0)

        # Initialize HMM
        self.HMM.training(CharRecognitionGUI_support.training_dataset)

        # Initialize Naive Bayes
        try:
            pickle.load(open("./Models/naivebayes_model.p", "rb"))
        except IOError:
            self.NaiveBayes.training(CharRecognitionGUI_support.training_dataset)

        # Initialize Random Forest
        try:
            pickle.load(open("./Models/random_forest.p", "rb"))
        except IOError:
            self.RandomForest.training(CharRecognitionGUI_support.training_dataset)

        # Initialize SVM
        try:
            pickle.load(open("./Models/svm.p", "rb"))
        except IOError:
            self.SVM.training(CharRecognitionGUI_support.training_dataset)

        # Initialize Logistic Regression
        try:
            pickle.load(open("./Models/logistic_model.p", "rb"))
        except IOError:
            self.LogisticReg.training(CharRecognitionGUI_support.training_dataset)

        # Initialize AdaBoost
        try:
            pickle.load(open("./Models/AdaBoostClassifier.p", "rb"))
        except IOError:
            self.AdaBoost.training(CharRecognitionGUI_support.training_dataset)

        # Initialize GBRT
        try:
            pickle.load(open("./Models/GradientBoostingClassifier.p", "rb"))
        except IOError:
            self.GBRT.training(CharRecognitionGUI_support.training_dataset)
Code example #15
def Process_air_quality():
    X, y = LoadData.load_ozone_data()

    Experiments.Models_Comparison(X, y, "Air Pollution")
    op = True

    DT.DecisionTree(X, y, title="Air Pollution Decision Tree", optimize=op)
    AB.AdaBoost(X, y, title="Air Pollution AdaBoost", optimize=op)
    KNN.KNN(X, y, title="Air Pollution KNN", optimize=op)
    NN.NeuralNetwork(X, y, title="Air Pollution Neural Network", optimize=op)
    SVM.SVM(X, y, title="Air Pollution SVM", optimize=op)
Code example #16
def retrieval_knn_shape(train_imgs, train_class_labels, K_max):
    knn = KNN.KNN(train_imgs, train_class_labels)

    for i in range(2, K_max):
        labels = knn.predict(test_imgs, i)
        asserted_label_percentaje, equals = get_shape_accuracy(
            labels, test_class_labels)
        visualize_retrieval(test_imgs,
                            30,
                            labels[:30],
                            equals[:30],
                            title="KNN shape with K: " + str(i))
Code example #17
def datingClassTest():
    hoRatio = 0.10  # fraction of samples held out for testing
    k = 7
    datingDataMat, datingLabels = getData.file2Matrix(
        '../datas/datingTestSet2.txt')
    normMat = normalization.normalize(datingDataMat)
    m = normMat.shape[0]  # number of rows
    numTestVecs = int(m * hoRatio)  # number of test samples
    errorCount = 0  # number of misclassified samples
    for i in range(numTestVecs):
        classifierResult = KNN.KNN(normMat[i, :], normMat[numTestVecs:m, :],
                                   datingLabels[numTestVecs:m], k)
        if classifierResult != datingLabels[i]:
            errorCount += 1
    return float(errorCount) / float(numTestVecs)
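
The normalization.normalize helper is not shown; a minimal min-max scaling sketch consistent with its use above (an assumption, not the project's actual code):

import numpy as np

def normalize(dataSet):
    # Scale every feature column into [0, 1].
    minVals = dataSet.min(axis=0)
    maxVals = dataSet.max(axis=0)
    return (dataSet - minVals) / (maxVals - minVals)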
Code example #18
def topNRecs():
    # Get entries from the two input fields
    csv_file_name = entry.get()
    string = entry2.get()

    # User selections
    numNeighbors = int(string[0])
    selectedPoint = int(string[2])

    # Call the KNN class and get the neighbors
    knnClient = KNN(csv_file_name, selectedPoint, numNeighbors)
    list_of_Neighbors = knnClient.driver()

    # Catch bad input from a bad csv
    if list_of_Neighbors is None:
        out = "Warning: No Neighbors. Edit the CSV or try again."
        list_of_Neighbors = []
    else:
        out = "Results are in the txt file"

    # Dump the results to a file
    with open('output.txt', 'w') as f:
        for neighbor in list_of_Neighbors:
            f.write("%s\n" % str(neighbor))

    # Transform the csv into a list of dicts
    reader = csv.DictReader(open(csv_file_name))
    dict_list = []
    for line in reader:
        dict_list.append(line)

    # Store the csv in a txt file with appropriate headers for better analysis
    with open('dict.txt', 'w') as d:
        for dictionary in dict_list:
            d.write("%s\n" % str(dictionary))

    list_of_Neighbors = [[str(j) for j in i] for i in list_of_Neighbors]
    res = [''.join(ele) for ele in list_of_Neighbors]

    # Output windows
    title = tk.Label(root, text=out, font=('helvetica', 10))
    canvas.create_window(220, 80, window=title)

    # More output windows suggesting the user try another csv file
    ans = tk.Label(root,
                   text="Try another txt file below!",
                   font=('helvetica', 15))
    canvas.create_window(250, 300, window=ans)
Code example #19
def Process_wine_quality():

    t0 = time.time()
    X, y = LoadData.load_wine_quality_data()
    op = True

    Experiments.Models_Comparison(X, y, "Wine Quality")

    DT.DecisionTree(X, y, title="Wine Quality Decision Tree", optimize=op)
    AB.AdaBoost(X, y, title="Wine Quality AdaBoost", optimize=op)
    KNN.KNN(X, y, title="Wine Quality KNN", optimize=op)
    NN.NeuralNetwork(X, y, title="Wine Quality Neural Network", optimize=op)
    SVM.SVM(X, y, title="Wine Quality SVM", optimize=op)

    t1 = time.time()
    print("total TIME")
    print(t1 - t0)
Code example #20
def Test_Retrieval_by_shape(test_images, neigh, forma, cantidad):
    check = []
    knn = KNN(train_imgs, train_class_labels)
    result_knn = knn.predict(test_images, neigh)
    index_image = Retrieval_by_shape(result_knn, forma, cantidad)
    for i in index_image:
        if result_knn[i] == test_class_labels[i]:
            check.append(True)
        else:
            check.append(False)
    GroundTruth = test_class_labels[index_image]
    titol = "Retrieval_by_shape   Query:" + str(forma)
    visualize_retrieval(test_images[index_image],
                        cantidad,
                        info=GroundTruth,
                        ok=check,
                        title=titol)
Code example #21
def identify(file):

    x_list = np.load('C:/Users/Jaqen/Desktop/matrix_list.npy')
    Y = np.load('C:/Users/Jaqen/Desktop/class_list.npy')
    knn = KNN.KNN()
    knn.fit(x_list, Y, 10)
    img = Image.open(denoise.process(file))
    letters = cut.cut(img)
    pre_letters = ''
    for i, j in letters:
        im = img.crop((i, 0, j, img.size[1]))
        X = norm.normalize(im)
        pca = norm.PCA(X, n_components=3)  # dimensionality reduction
        X_ = pca.reduce_dim()
        predic = knn.predict(X_)
        pre_letters += predic
    return pre_letters
Code example #22
def main():
    data_set = np.loadtxt('haberman.csv', dtype=float, delimiter=',', skiprows=1)
    best_k = int(KNN.get_KNNbestK(data_set))
    recallknn, fscoreknn, gmeanknn, TP, FN, TN, FP = KNN.KNN(data_set, best_k)
    recallwknn, fscorewknn, gmeanwknn, TP1, FN1, TN1, FP1 = WKNN.WKNN(data_set, best_k)
    recallpknn, fscorepknn, gmeanpknn, TP2, FN2, TN2, FP2 = PTM_KNN.PTM_KNN(data_set)
    recallpwknn, fscorepwknn, gmeanpwknn, TP3, FN3, TN3, FP3 = PTM_WKNN.PTM_WKNN(data_set)
    recallnpwknn, fscorenpwknn, gmeannpwknn, TP4, FN4, TN4, FP4 = NEW_PTM_WKNN.NEW_PTM_WKNN(data_set)
    print(TP, FN, TN, FP)
    print(TP1, FN1, TN1, FP1)
    print(TP2, FN2, TN2, FP2)
    print(TP3, FN3, TN3, FP3)
    print(TP4, FN4, TN4, FP4)
    class1, class1_number = get_positive_class(data_set)
    rects = plt.barh(class1, class1_number)
    plt.title('Positive/negative class distribution')
    for rect in rects:
        width = rect.get_width()
        plt.text(width, rect.get_y() + rect.get_height() / 2, str(width), ha='center', va='bottom')
    plt.show()

    # Truncate each metric to two decimal places, as a percentage
    buy_number = [int(recallknn*10000)/100, int(recallwknn*10000)/100, int(recallpknn*10000)/100, int(recallpwknn*10000)/100, int(recallnpwknn*10000)/100]
    buy_number2 = [int(fscoreknn*10000)/100, int(fscorewknn*10000)/100, int(fscorepknn*10000)/100, int(fscorepwknn*10000)/100, int(fscorenpwknn*10000)/100]
    buy_number3 = [int(gmeanknn*10000)/100, int(gmeanwknn*10000)/100, int(gmeanpknn*10000)/100, int(gmeanpwknn*10000)/100, int(gmeannpwknn*10000)/100]
    name = ['KNN', 'WKNN', 'PTM-KNN', 'PTM-WKNN', 'NEW_PTM-WKNN']
    total_width, n = 2, 3
    width = total_width / n
    x = [0, 2.5, 5, 7.5, 10]
    a = plt.bar(x, buy_number, width=width, label='Recall', fc='y')
    for i in range(len(x)):
        x[i] = x[i] + width
    b = plt.bar(x, buy_number2, width=width, label='F-score', tick_label=name, fc='r')
    for i in range(len(x)):
        x[i] = x[i] + width
    c = plt.bar(x, buy_number3, width=width, label='G-mean', fc='b')
    autolabel(a)
    autolabel(b)
    autolabel(c)

    plt.xlabel('Algorithm')
    plt.ylabel('Percentage (%)')
    plt.title('Experimental results')
    plt.legend()
    plt.show()
Code example #23
    def runPipeline(self, seeDiff=True, k=10):
        filtered = self.data.filter(regex=(r'_\d')).copy()
        filtered.fillna('NA', inplace=True)
        
        agg_filtered = makeAggregate(filtered)
        
        print("Running pipeline length", len(pipeline))
        #print("Start distribution", getDistribution(agg_filtered))
        
        distributions = []
        for number, imputation in enumerate(pipeline):
            print("Imputation number", number+1)
            print("Pattern: ", imputation[0], "Imputation: ", imputation[1])
            patternMatcher = PatternMatcher(agg_filtered, imputation[0])
            indexes = patternMatcher.returnIndex()
            print(len(indexes), "of pattern found")
            agg_filtered = patternMatcher.imputePattern(imputation[1])

            #print("Distribution of Imputation number", number+1)
            #print('\n',getDistribution(agg_filtered))
            distributions.append(getDistribution(agg_filtered))
        
        
        retVal = unAggregate(agg_filtered, self.data)
        retVal = retVal.replace({'NA': np.nan})

        if self.run_knn:
            for_knn = retVal.filter(regex=(r'_\d'))
            knn_imputed = KNN(k=k).complete(for_knn)
            knn_imputed = pd.DataFrame(knn_imputed)
            knn_imputed.columns = for_knn.columns
            knn_imputed = knn_imputed.applymap(bar)

            diff = set(self.data.columns).difference(set(self.data.filter(regex=(r'_\d')).columns))
            for i in diff:
                knn_imputed[i] = self.data[i]
            retVal = knn_imputed

        if seeDiff:
            return retVal, distributions
        else:
            return retVal
Code example #24
def performKNN(inputDataClass,
               nearestNeighbours,
               mode,
               label_with_distance=False):
    covar = -1
    if mode == 3:
        covar = performanceAnalyser.getFullCovariance(
            inputDataClass.Train[:, :-1])
    knn = KNN.KNN(nearestNeighbours,
                  inputDataClass.Train[:, :-1],
                  inputDataClass.Test[:, :-1],
                  inputDataClass.Train[:, -1],
                  label_with_distance=label_with_distance,
                  mode=mode,
                  covar=covar)
    knn.allocate()
    Ypred = knn.labels
    Ytrue = inputDataClass.Test[:, -1]

    print("Testing Accuracy = " +
          str(performanceAnalyser.calcAccuracyTotal(Ypred, Ytrue)))
Code example #25
File: train.py Project: Emmonss/Numpy_DL_and_ML
def Main():
    horatio = 0.1
    trainset, trainlabel = ud.get_file_matrix('trainset.txt')

    trainset = np.array(ud.normalized(trainset))

    # Hold out the first 10% of the samples for evaluation
    fiter = int(len(trainlabel) * horatio)

    out = model.KNN(trainset[:fiter],
                    trainset[fiter:],
                    trainlabel[fiter:],
                    k=3)
    acc, precision, recall, F1 = ud.get_Precision_Recall(
        out, trainlabel[:fiter])
    hl = hamming_loss(trainlabel[:fiter], out)
    print("hamming_loss:{}".format(hl))
    print("Acc:{}".format(acc))
    print("Precision:{}".format(precision))
    print("Recall:{}".format(recall))
    print("F1:{}".format(F1))

    target_names = ['class 0', 'class 1', 'class 2']
    print(classification_report(trainlabel[:fiter], out, target_names=target_names))
Code example #26
def get_knn_accuracy_(train_imgs, train_class_labels, K_max):
    plt.clf()
    knn = KNN.KNN(train_imgs, train_class_labels)
    distances_to_use = ["euclidean", "cityblock"]
    print("starting knn with next heuristics for distance: ", distances_to_use)
    print("estimated time 6 mins")

    for distance in distances_to_use:
        time.sleep(3)
        percentages_returned = []
        time1 = time.time()

        for i in range(2, K_max):
            labels = knn.predict(test_imgs, i, distance)
            asserted_label_percentaje, equals = get_shape_accuracy(
                labels, test_class_labels)
            plt.clf()
            plt.scatter(test_class_labels.tolist(), labels.tolist())
            plt.title("expected shape VS actual shape with K = " + str(i))
            plt.xlabel("test class labels")
            plt.ylabel("asserted labels")
            plt.grid()
            plt.savefig(output_folder + "formas encontradasK=" + str(i) +
                        distance + ".png")

            percentages_returned.append(asserted_label_percentaje)

        print(distance + " finished in: ", time.time() - time1)

        # Graph
        plt.clf()
        plt.scatter(list(range(2, K_max)), percentages_returned)
        plt.title("KNN % " + distance + " success")
        plt.xlabel("K")
        plt.ylabel("%")
        plt.savefig(output_folder + "porcentaje " + distance + ".png")
Code example #27
    def choix_classifieurs(self, X_train, y_train, X_test, y_test):

        print(
            " \n\t\t--- Searching for the best classifier of each kind ---\n\n"
        )

        # Classifier selection

        print(" --- Search for Naive Bayes ---\n")
        # Naive Bayes
        nB = nb.NaiveBayes()
        clfNB = nB.choixNB(X_train, y_train, X_test, y_test)

        # Decision tree
        print(" --- Search for Decision Tree ---\n")
        tree = dt.DecisionTree()
        clfTree, _ = tree.recherche_param(X_train, y_train, X_test, y_test)

        # K nearest neighbours
        print(
            "\n --- No parameter search for K nearest neighbours ---\n"
        )
        kNN = knn.KNN()

        # SVM
        print(" --- Search for the SVM ---\n")
        sVM = svm.SVM()
        clfSVM = sVM.hyperParameter(X_train, y_train)

        # Perceptron
        print(" --- Search for the Perceptron ---\n")
        perceptron = perceptr.Perceptr()
        clfPerceptr = perceptron.rechercheHypParm(X_train, y_train, X_test,
                                                  y_test)

        return (clfNB, clfTree, kNN, clfPerceptr, clfSVM)
Code example #28
    print(type(buying[0]))

    raw_x = list(zip(buying, maint, door, persons, lug_boot, safety))
    raw_y = list(cls)

    raw_x = np.array(raw_x, dtype=np.float64)
    raw_y = np.array(raw_y)

    x_train, x_test, y_train, y_test = train_test_split(raw_x, raw_y, random_state=1, test_size=0.2)

    # Apply KNN for k = 1..100, recording the accuracy of each run
    accuracy = {'method': 'mahattan_distance'}
    for i in range(1, 101):
        model = KNN(i)
        model.fit(x_train, y_train, accuracy['method'])
        predictions = model.predict(x_test)
        acc, con_matrix = model.evaluate(y_test, predictions)
        accuracy.update({i: acc})

import json
with open("norm1.json", 'w') as file:
    json.dump(accuracy, file)  # assumed body: the snippet is truncated here
Code example #29
import numpy as np
import DT as dt
import KNN as knn


if __name__ == '__main__':

    print('running tests on DT and KNN')
    # This is the class example [mathy, test >= 80, project >= 80, early]
    # with a slight change so that non-mathy first splits on early.
    trX = np.array([[1,1,1,1],[1,1,1,0],[0,1,0,1],[0,0,1,1],[0,0,1,1],[0,0,0,0],[0,0,0,0],[1,0,1,1],[1,0,0,1],[0,0,1,1],[1,0,0,0],[0,0,1,1],[0,1,0,1],[0,0,1,0]])
    trY = np.array([[1],[1],[0],[0],[0],[1],[0],[1],[0],[0],[0],[0],[0],[1]])
    deX = np.array([[0,1,0,0],[0,0,1,0],[0,1,1,1]])
    deY = np.array([[0],[1],[0]])

    decTree = dt.DT()
    print('DT, cutoff=0')
    trainModel = decTree.res('train', X=trX, Y=trY, h_param=0)
    decTree.DTdraw(trainModel)
    output = decTree.res('predict', model=trainModel, test_case=deX)
    print(output)

    knnMode = knn.KNN()
    print('KNN, k=1')
    trainModel = knnMode.res('train', X=trX, Y=trY, h_param=1)
    output = knnMode.res('predict', model=trainModel, test_case=deX)
    print(output)

    print('Done')
Code example #30
    if dec not in results.keys():
        print('\nNow we vary the cutoff for the decision tree and see how it affects accuracy...')
        thresh = [5, 10, 20, 40, 80, 160]
        decTree = DT.DT()
        res = run_comps(decTree, thresh, trX[0:4800, :], trY[0:4800], trX[4801:6000, :],
                        trY[4801:6000], "Figure 2: DT cutoff versus accuracy (MNIST)", "DT cutoff", "../figure2.png")
        results[dec] = res
        res_file.write('\n' + dec + '\n')
        res_file.write(str(res))
        input('Press enter to continue...')

    neigh = 'knn' + data_types[i]
    if neigh not in results.keys():
        print('\nNow we vary the k for the KNN classifier and see how it affects accuracy...')
        allK = [1, 8, 16, 32, 64, 128]
        knnModel = KNN.KNN()
        res = run_comps(knnModel, allK, trX[0:2000, :], trY[0:2000], trX[2001:2501, :],
                        trY[2001:2501], "Figure 3: KNN count versus accuracy (MNIST)", "KNN count", "../figure3.png")
        results[neigh] = res
        res_file.write('\n' + neigh + '\n')
        res_file.write(str(res))
        input('Press enter to continue...')

    heldDT = 'hoDT_' + data_types[i]
    if heldDT not in results.keys():
        print('\nNow we make predictions on dev and test data using the best DT')
        thresh = [5, 10, 20, 40, 80, 160]
        dtres = 'dt' + data_types[i]
        dtAccs = results[dtres]
        bestDT = np.argmax(dtAccs)
        decTree = DT.DT()