Example #1
    def test_knn_classify_basic(self):
        factors = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1], [-1, -1], [-0.5, -1.1]])
        labels = ['A', 'A', 'B', 'B', 'C', 'C']

        prediction = knn.classify([1.1, 0.9], factors, labels, 3)
        self.assertTrue(prediction == 'A')

        prediction = knn.classify([-0.1, 0.3], factors, labels, 3)
        self.assertTrue(prediction == 'B')

        prediction = knn.classify([-0.7, -1.3], factors, labels, 3)
        self.assertTrue(prediction == 'C')
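Most snippets on this page call a `knn.classify(point, dataset, labels, k)` helper whose module is not shown. As a reference point, here is a minimal sketch consistent with the call sites above, assuming Euclidean distance and majority voting (the real projects may differ):

from collections import Counter
import numpy as np

def classify(point, dataset, labels, k):
    # Minimal kNN sketch (assumed): Euclidean distance + majority vote.
    diffs = np.asarray(dataset, dtype=float) - np.asarray(point, dtype=float)
    distances = np.sqrt((diffs ** 2).sum(axis=1))  # distance to every training row
    nearest = distances.argsort()[:k]              # indices of the k closest rows
    votes = Counter(labels[i] for i in nearest)    # tally the neighbors' labels
    return votes.most_common(1)[0][0]              # majority label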
Example #2
def handwritingClassTest():
    hwLabels = []
    trainingFileList = listdir('trainingDigits')
    m = len(trainingFileList)
    trainingMat = zeros((m,1024))
    
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        hwLabels.append(classNumStr)
        trainingMat[i,:] = img2vector('trainingDigits/%s' % fileNameStr)
    testFileList = listdir('testDigits')
    errorCount = 0.0
    mTest = len(testFileList)
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
        classifierResult = classify(vectorUnderTest,trainingMat,hwLabels,3)
        
        print("the classifier came back with: %d, the real answer is : %d" % (classifierResult,classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    
    print("The total number of errors is: %d." % errorCount)
    print("The total error rate is: %f." % (errorCount/float(mTest)))
Example #3
def main(dataset_name, testset_name, new_emails=False):
    '''Runs the knn classifier for a training set dataset_name and test set testset_name'''
    current_path = os.path.dirname(os.path.abspath(__file__)) + "\\"
    trainingset_path = current_path + dataset_name + "\\"
    testset_path = current_path + testset_name + "\\"
    results_path = testset_path + "results\\"

    if not os.path.exists(results_path):
        os.mkdir(results_path)

    folder_names = next(os.walk(testset_path + "."))[1]
    if 'results' in folder_names:
        folder_names.remove('results')
    if new_emails:
        folder_names = [""]

    workfilename = 'mergedworkfile.csv'
    wordfilename = 'wordfile.csv'
    # klist = [1, 3, 7, 15, 24, 33, 42, 50]
    klist = [1, 3]
    acc = []
    ks = []
    trainingSet = []

    print("Loading Training Set...")
    wordsd, subd, digramsd, trigramsd = ex.loadTrainingset(
        trainingset_path, workfilename, wordfilename, trainingSet)
    print("Training Set loaded.")

    print('Collecting ' + 'New' * new_emails + 'Test' * (not new_emails) +
          ' Emails...')
    testSet, all_files = ex.loadTestset(testset_path, folder_names, wordsd,
                                        subd, digramsd, trigramsd)
    print('New' * new_emails + 'Test' * (not new_emails) +
          ' Emails Collected.')

    assert (len(trainingSet[0]) == len(testSet[0]))

    list_of_predictions = knn.classify(klist, trainingSet, testSet,
                                       results_path)

    if not new_emails:
        # Compute the per-k accuracy on the labelled test emails from their predictions
        for i in range(len(klist)):
            predictions = []
            for x in range(len(testSet)):
                predictions.append(list_of_predictions[x][i])
            accuracy = knn.getAccuracy(testSet, predictions)
            acc.append(accuracy)
            ks.append(klist[i])
            print('K: ' + repr(klist[i]))
            print('Accuracy: ' + repr(accuracy) + '%')

        print('Overall Accuracy: ' + str(sum(acc) / len(acc)) + "%")
        plt.plot(ks, acc)
        plt.xlabel('K')
        plt.ylabel('Accuracy')
        plt.show()

    print('Find the results at: ' + results_path)
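`knn.getAccuracy(testSet, predictions)` is not shown on this page either. Judging from Examples #9 and #15 below, where the last field of each test row is the true label and accuracy is reported as a percentage, a plausible sketch is:

def getAccuracy(testSet, predictions):
    # Assumed helper: percentage of rows whose last field matches the prediction.
    correct = sum(1 for row, pred in zip(testSet, predictions) if row[-1] == pred)
    return correct / float(len(testSet)) * 100.0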
Example #4
File: test.py Project: reece15/KNN
    def test_classify(self):
        point = (100, 110)  # unknown sample
        dataSet = [(1, 0), (1, 1), (-1, -2), (120, 100), (100, 100),
                   (120, 130)]  # known sample dataset
        dataLabel = ['X', 'X', 'X', 'Y', 'Y', 'Y']  # class labels of the known samples
        res = knn.classify(point, dataSet, dataLabel, 3)

        assert res == "Y"
Example #5
File: main.py Project: YuHang0/kNN
def classify_test_data(filename):
    # Classify a single file from the test set
    file_path = test_digits_path + filename
    try:
        data = img2vector(file_path)
        res = classify(data, train_dataset, train_labels, 3)
        return int(res)
    except FileNotFoundError:
        print("No such file.")
Example #6
def classify_person(percent_game, fly_miles, ice_cream):
    k_value = 3
    labels_str = ['not at all', 'in small doses', 'in large doses']
    input_point = array([percent_game, fly_miles, ice_cream])
    data_set, labels = get_data_set_from_file("dating.dataset")
    data_set, value_ranges, min_values = knn.auto_normalize(data_set)
    normalized_point = knn.normalize(input_point, value_ranges, min_values)
    result = knn.classify(normalized_point, data_set, labels, k_value)
    return labels_str[result-1]
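`knn.auto_normalize` and `knn.normalize` are also not shown; from the unpacking above and the inline `(x - min_vals) / ranges` math in Examples #17 and #19 below, they look like a min-max scaling pair. A hedged sketch under that assumption:

import numpy as np

def auto_normalize(data_set):
    # Assumed min-max scaler: rescale each column of the training data into [0, 1].
    min_values = data_set.min(axis=0)
    value_ranges = data_set.max(axis=0) - min_values
    normalized = (data_set - min_values) / value_ranges
    return normalized, value_ranges, min_values

def normalize(point, value_ranges, min_values):
    # Apply the same per-column scaling to a single query point.
    return (point - min_values) / value_ranges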
Example #8
def classify(p):

    # return a list of feature vectors from the image table
    training_data = knn.construct(p)

    kes = [3, 5, 7, 9]

    with open('p_files/test_table_' + seg + '_pc' + str(p) + '.p', 'rb') as f:
        test_table = pickle.load(f)

    max_count = len(test_table)

    for k in kes:
        start_time = time.time()
        print('seg: ' + seg + ' - k : ' + str(k))
        i = 1
        correct = 0
        ROC = {}
        for im_struct in test_table:

            # classify new image from training data
            # get a sorted list of the class id and the number of votes
            label_candidates = knn.classify(training_data, im_struct['feature_vector'], k)

            # label_candidates[0][0] should give the classification
            label = str(int(label_candidates[0][0]))

            im_struct['label_candidates'] = label_candidates
            im_struct['prediction'] = True
            class_id = im_struct['class_id']

            print(label, class_id)
            if class_id == label:
                correct += 1
                ROC = update_ROC(ROC, class_id, tp=True, number=True)
            else:
                ROC = update_ROC(ROC, class_id, number=True)
                ROC = update_ROC(ROC, label, fp=True)
                im_struct['prediction'] = False

            i += 1

        print('accuracy: ', correct / max_count)
        print('correct: ', correct)

        with open('p_files/test_table_' + seg + '_pc' + str(p) + '_k' + str(k) + '.p', 'wb') as f:
            pickle.dump(test_table, f)

        with open('p_files/ROC_table_' + seg + '_pc' + str(p) + '_k' + str(k) + '.p', 'wb') as f:
            pickle.dump(ROC, f)

        toe = (time.time() - start_time)
        times.append(toe)
Example #9
def flores_data_set_run():
    data_set = arff.load(open('./data/flores.arff'))
    test_set, train_set = list_helper.separate_list(data_set['data'], 10)
    results = []

    for x in range(len(test_set)):
        result = knn.classify(train_set, test_set[x][:-1])
        results.append(result)
        print "Real: " + test_set[x][-1] + " - Predicted: " + result

    accuracy = knn.calculate_accuracy(test_set, results)
    print "Accuracy: " + str(accuracy) + "%"
Example #10
File: ml.py Project: eadm/ML
def remove_noise(points, classes, metric, kernel, k):
    fds = folds(points, classes, len(points))
    pts, cls = [], []

    for fold in fds:
        cl = knn.classify(fold["train_p"], fold["train_c"], fold["test_p"][0],
                          metric, kernel, k)
        if cl == fold["test_c"][0]:  # not noise
            pts.append(fold["test_p"][0])
            cls.append(cl)

    return np.array(pts), np.array(cls)
Example #11
    def test_knn_classify_digits(self):
        test_xs, test_ys = data.create_digit_matrix('digits/testDigits')
        train_xs, train_ys = data.create_digit_matrix('digits/trainingDigits')

        errors = 0
        for index, test_row in enumerate(test_xs):
            predicted_y = knn.classify(test_row, train_xs, train_ys, 3)
            if predicted_y != test_ys[index]:
                errors += 1

        print(errors)
        print(test_xs.shape[0])
        self.assertTrue(errors / test_xs.shape[0] < 0.85)
Example #12
def datingClassTest():
    ratio = 0.1
    k = 3
    features, labels = preparation.file2matrix('datingTestSet2.txt')
    features = utils.normalize(features)
    #plotter.scatter(features[:,0],features[:,1],labels)
    numTestVecs = int(ratio * features.shape[0])
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = knn.classify(features[i, :], features[numTestVecs:features.shape[0], :], labels[numTestVecs:features.shape[0]], k)
        print("feature: %s | yhat: %d | y: %d" % (features[i, :], classifierResult, labels[i]))
        if classifierResult != labels[i]:
            errorCount += 1
    print("Error rate = %f" % (errorCount / numTestVecs))
Example #13
def test():
    '''
        run test script
    '''

    dataset, labels = load_dataset()

    le = preprocessing.LabelEncoder()
    encoded_labels = le.fit_transform(labels)

    n_neighbor = classify(dataset, [[1000, 0.5, 340]], encoded_labels)

    print(le.inverse_transform(n_neighbor))
Example #14
File: main.py Project: YuHang0/kNN
def classify_test_dataset(k):
    # Classify the whole test set
    test_num = len(test_dataset)
    # number of test samples
    error_num = 0
    # number of errors
    for data, label in zip(test_dataset, test_labels):
        res = classify(data, train_dataset, train_labels, k)
        # predict the label for this test sample
        if res != label:
            error_num += 1
        # if the prediction is wrong, increment the counter
    print("total:{},error num:{},error rate:{}".format(test_num, error_num, error_num / test_num))
Example #15
def prediccion_data_set_run():
    data_set = arff.load(open('./data/prediccion.arff'))
    cleaned_data_set = [list_helper.unicode_to_int(x) for x in data_set['data']]
    test_set, train_set = list_helper.separate_list(cleaned_data_set, 1)

    results = []

    for x in range(len(test_set)):
        result = knn.classify(train_set, test_set[x][:-1])
        results.append(result)
        print "Real: " + str(test_set[x][-1]) + " - Predicted: " + str(result)

    accuracy = knn.calculate_accuracy(test_set, results)
    print "Accuracy: " + str(accuracy) + "%"
Example #16
    def classify(self, sudoku):
        feature_vector = self.feature.feature_vector(sudoku)
        level = feature_vector.pop(0)
        point = dict()
        point['features'] = feature_vector
        point = knn.classify(point, "train_results.txt", 5)
        level = int(point['level'])
        sgl = learning.StochasticGradientLearner(learning.basicFeatureExtractor)
        feature_vector.insert(0, level)
        sgl_level = sgl.predict_one(feature_vector)
        weighted_level = (2 * level + 3 * sgl_level) / 5
        if level == 1 or sgl_level == 1:
            return 1
        return weighted_level
Example #17
def classify_gui(k):
    """
    :param k: value of k
    :return: predicted person category
    """
    data_mat, class_label_vector = file_to_matrix('../data/dating_test_set_2.txt')
    fly_distances = float(input("frequent flyer miles earned per year: "))
    icecream = float(input("liters of ice cream consumed per year: "))
    play_time = float(input("percentage of time spent playing games: "))
    norm_data_set, ranges, min_vals = auto_norm(data_mat)
    data_person = np.array([fly_distances, icecream, play_time])
    norm_person_data = (data_person - min_vals) / ranges
    class_person = ['dislike', 'average', 'very charming']
    label_person = classify(norm_person_data, norm_data_set, class_label_vector, k)
    return class_person[label_person - 1]
Example #18
def datingClassTest():
    ratio = 0.1
    k = 3
    features, labels = preparation.file2matrix('datingTestSet2.txt')
    features = utils.normalize(features)
    #plotter.scatter(features[:,0],features[:,1],labels)
    numTestVecs = int(ratio * features.shape[0])
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = knn.classify(
            features[i, :], features[numTestVecs:features.shape[0], :],
            labels[numTestVecs:features.shape[0]], k)
        print "feature: %s | yhat: %d | y: %d" % (features[i, :],
                                                  classifierResult, labels[i])
        if (classifierResult != labels[i]): errorCount += 1
    print "Error rate = %f" % (errorCount / numTestVecs)
Example #19
def classify_person():
    """
    Classify a person from user-supplied data
    :return: the predicted class label
    """
    # person categories: [0, 1, 2]
    ff_miles = float(input("frequent flyer miles earned per year: "))
    ice_cream = float(input("liters of ice cream consumed per week: "))
    percent_game = float(input("percentage of time spent playing games: "))
    data_mat, class_label_vector = file_to_matrix('../data/dating_test_set_2.txt')
    norm_dating_data_set, ranges, min_vals = auto_norm(data_mat)
    in_x = np.array([ff_miles, ice_cream, percent_game])  # sample to classify

    norm_in_x = (in_x - min_vals) / ranges
    classify_result = classify(norm_in_x, norm_dating_data_set, class_label_vector, 3)
    return classify_result
Example #20
    def test_knn_classify_dating(self):
        xs, ys = data.create_dating_set()
        xs, mins, ranges = knn.normalize_cols(xs)

        n = xs.shape[0]
        test_n = int(n * 0.1)

        errors = 0
        for i in range(test_n):
            predicted_y = knn.classify(xs[i], xs[test_n:n, :], ys[test_n:n], 5)
            if predicted_y != ys[i]:
                errors += 1

        print(errors)
        print(test_n)
        self.assertTrue(errors / float(test_n) < 0.85)
Example #21
def dating_class_test():
    """
    Estimate the classifier's error rate on a held-out test set
    :return: None
    """
    hold_out_ratio = 0.10  # fraction of the data held out as the test set
    data_set, labels = file_to_matrix('./data/dating_test_set_2.txt')
    norm_data_set, ranges, min_vals = auto_norm(data_set)
    size = norm_data_set.shape[0]  # number of rows in the dataset
    num_test_size = int(size * hold_out_ratio)  # number of test rows
    error_count = 0.0  # error counter
    for i in range(num_test_size):
        classifier_result = classify(norm_data_set[i, :], norm_data_set[num_test_size:size], labels[num_test_size:size], 5)
        print('classifier returned: %d, real answer: %d' % (classifier_result, labels[i]))
        if classifier_result != labels[i]:
            error_count += 1.0
    print('classifier error rate: %0.2f%%' % (error_count / float(num_test_size) * 100))
Example #22
def run():
    classificadores = ["Bayesian", "KNN 1", "KNN 5", "KNN 10", "KNN 20", "KNN 30", "Sum 1", "Sum 5", "Sum 10", "Sum 20", "Sum 30"]
    errorResults = {}

    for c in classificadores:
        errorResults[c] = []

    K = [30, 20, 10, 5, 1]

    resultsBay = open('part2-results-bayesian.txt', 'a')
    resultsKn = open('part2-results-knn.txt', 'a')
    resultsSum = open('part2-results-sum.txt', 'a')

    for j in range(10):
        H = data_proccessing.loadData() # Folds

        for i in range(10):
            resultsBay.write("Round %d\n\n" % (j*10+i+1))
            resultsKn.write("Round %d\n\n" % (j*10+i+1))
            resultsSum.write("Round %d\n\n" % (j*10+i+1))

            (trainX, trainY, testX, testY) = prepareSets(H, i)
            (P_bay, E_bay, e_rate_bay, se_bay, interval_bay) = bayesian.classify(trainX, trainY, testX, testY)

            resultsBay.write("- Bayesian\n")
            writeResults(resultsBay, e_rate_bay, se_bay, interval_bay)
            errorResults["Bayesian"].append(e_rate_bay)

            for k in K:
                (P_kn, E_kn, e_rate_kn, se_kn, interval_kn) = knn.classify(trainX, trainY, testX, testY, k)
                (P_sum, E_sum, e_rate_sum, se_sum, interval_sum) = sum_rule.classify([P_bay, P_kn], testX, testY)
                #
                resultsKn.write("- KNN (n = %d)\n" % k)
                writeResults(resultsKn, e_rate_kn, se_kn, interval_kn)
                errorResults["KNN %i" % k].append(e_rate_kn)
                #
                resultsSum.write("- Sum (n = %d)\n" % k)
                writeResults(resultsSum, e_rate_sum, se_sum, interval_sum)
                errorResults["Sum %i" % k].append(e_rate_sum)

    resultsBay.close()
    resultsKn.close()
    resultsSum.close()

    compare(errorResults)
Example #23
def test():
    '''
        knn test of dating data
    '''

    # import dataset, normalized dataset and class labels for dataset
    dset, normalizing, labeling = process_data('datingTestSet.txt')

    # normalized dataset, ranges, minimum values
    # and maximum values from dataset
    norm_dset, ranges, min_vals, max_vals = normalizing

    # label indices matching each sample in the dataset
    # against the class labels key
    label_indices, labels_key = labeling

    # use 10 percent of training data as test data
    ho_ratio = 0.10

    # m is number of samples in dataset
    m = norm_dset.shape[0]

    # number of test samples
    num_tests = int(m * ho_ratio)

    # loop over all test samples and compare known labels versus algorithm
    # classification and print out error rate
    error_count = 0.0
    for i in range(num_tests):
        # normalize test sample
        norm_test = (dset[i, :] - min_vals) / ranges

        # classify test sample
        classification = classify(norm_test, norm_dset[num_tests:m, :],
                                    label_indices[num_tests:], 3)

        print('classifier answer: {}, real answer: {}'.format(
            labels_key[classification], labels_key[label_indices[i]]))

        # compare known label to classifier label
        if labels_key[classification] != labels_key[label_indices[i]]:
            error_count += 1.0

    print('total error rate: {}'.format(error_count / float(num_tests)))
Example #24
def classify_number_test(data_set_path, test_data_path, k, ratio=1):
    """
    Recognize digits with kNN and report the accuracy
    :param data_set_path: training set path
    :param test_data_path: test set path
    :param k: value of k
    :param ratio: fraction of the training set to load
    :return: None
    """
    start = time.perf_counter()  # start timing (time.clock was removed in Python 3.8)
    data_set, labels = file2Matrix(data_set_path, ratio)  # load the training set
    data_set_test, labels_test = file2Matrix(test_data_path)  # load the test set
    true_count = 0

    for i in range(data_set_test.shape[0]):
        result_label = classify(data_set_test[i], data_set, labels, k)  # run the kNN classifier
        # print('classifier returned: %d, real answer: %d' % (result_label, labels_test[i]))
        if result_label == labels_test[i]:
            true_count = true_count + 1
    end = time.perf_counter()
    print('k: %d, accuracy: %0.2f%%, elapsed: %0.6f' %
          (k, (true_count / data_set_test.shape[0] * 100), (end - start)))
Example #25
def hand_writing_class_test():
    """
    Test the accuracy of handwritten digit classification
    :return: None
    """

    # Step 1: build the training set
    hw_labels = []
    training_file_list = listdir('../data/digits/training_digits')  # list the directory contents
    m = len(training_file_list)
    training_data_set = np.zeros((m, 1024))  # one 1x1024 row per training file
    for i in range(m):
        filename_str = training_file_list[i]
        class_num_str = int(filename_str.split('_')[0])  # the label is the filename prefix
        hw_labels.append(class_num_str)  # append the class to the label vector

        img_vector = img_to_vector('../data/digits/training_digits/%s' %
                                   filename_str)
        training_data_set[i, :] = img_vector
    # Step 2: build the test set and classify it
    test_file_list = listdir('../data/digits/test_digits')
    m_test = len(test_file_list)
    error_count = 0.0
    for i in range(m_test):
        filename_str = test_file_list[i]
        class_num_str = int(filename_str.split('_')[0])
        test_img_vector = img_to_vector('../data/digits/test_digits/%s' %
                                        filename_str)
        # classify with a 5-NN model
        classifier_result = classify(test_img_vector, training_data_set,
                                     hw_labels, 5)
        print(classifier_result)
        if classifier_result != class_num_str:
            error_count += 1.0
    print('error rate: %f' % (error_count / float(m_test)))
Example #26
def cross_validate(train_data, train_labels, k, distance, F=5, prints=True):
    """
    Performs F-fold cross-validation on the specified training set. Returns an
    array storing all cross-validation accuracies.
    """
    # number of training instances in each cross-validation subset
    C = train_data.shape[0] // F
    # initialize empty array to store cross-validation accuracies
    accuracy = np.zeros(F)
    # for each round of cross-validation
    for f in range(F):
        # create indices for the validation set
        validation_index = np.arange(f * C, (f + 1) * C)
        # create indices for the training set
        train_index = np.setdiff1d(np.arange(0, train_data.shape[0]),
                                   validation_index)
        # obtain predicted labels for the images in the validation set
        predicted_labels = knn.classify(train_data[train_index],
                                        train_labels[train_index],
                                        train_data[validation_index], k,
                                        distance)
        # compute confusion matrix for validation set
        con_matrix = knn.confusion_matrix(train_labels[validation_index],
                                          predicted_labels)
        # convert to pandas data frame to label rows and columns, then print
        # suppressed when performing cross-validation for multiple values of k
        if prints:
            con_mat_df = pd.DataFrame(con_matrix,
                                      index=['1', '2', '7'],
                                      columns=['1', '2', '7'])
            print('Cross-validation round', f + 1)
            print(
                'Confusion matrix: Predicted classes along horizontal axis. Actual classes along vertical axis.'
            )
            print(con_mat_df)
        # compute and store cross-validation accuracy
        accuracy[f] = knn.accuracy(con_matrix)
    return accuracy
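A typical way to use `cross_validate` is to sweep k and keep the value with the best mean accuracy. A sketch, assuming `train_data`, `train_labels`, and a distance name accepted by `knn.classify` in this project:

# Hypothetical sweep over k; 'euclidean' is an assumed distance name.
best_k, best_acc = None, 0.0
for k in [1, 3, 5, 7, 9]:
    acc = cross_validate(train_data, train_labels, k, 'euclidean', prints=False)
    if acc.mean() > best_acc:  # keep the best average cross-validation accuracy
        best_k, best_acc = k, acc.mean()
print('best k:', best_k, 'mean accuracy:', best_acc)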
Example #27
def test():
    '''
        run test script
    '''

    train_dir = getcwd() + '/datasets/hw/trainingDigits/'
    train_dataset, train_filenames = build_dataset(train_dir)
    train_labels = labels_from_filenames(train_filenames)
    train_vectors = to_vectors(train_dataset)

    test_dir = getcwd() + '/datasets/hw/testDigits/'
    test_dataset, test_filenames = build_dataset(test_dir)
    test_labels = labels_from_filenames(test_filenames)
    test_vectors = to_vectors(test_dataset)

    n_neighbors = classify(train_vectors, test_vectors, train_labels)

    err_count = 0
    for idx, prediction in enumerate(n_neighbors):
        if prediction != test_labels[idx]:
            err_count += 1

    print('error rate: {}'.format(err_count / float(len(test_vectors))))
Example #28
def test():
    '''
        get user input for test dating data
    '''
    dset, normalizing, labeling = process_data('datingTestSet.txt')
    norm_set, ranges, min_vals, max_vals = normalizing
    label_indices, label_keys = labeling

    # classes key
    class_names = ['not at all', 'in small doses', 'in large doses']

    # user input -- sample for testing
    gaming = float(input('percent time playing video games?'))
    flyerMiles = float(input('frequent flyer miles earned each year?'))
    iceCream = float(input('liters of ice cream consumed per year?'))
    user_input = array([flyerMiles, gaming, iceCream])

    # normalize test sample
    norm_test = (user_input - min_vals) / ranges

    # classify user input
    classifications = classify(norm_test, norm_set, label_indices, 3)
    print('You will probably like this person: ',
          class_names[classifications - 1])
Example #29
def main():
    technique = sys.argv[3]

    if technique in ("nearest", "best"):
        if technique == "best":
            print("We are getting the best accuracy with KNN, and adaboost is closely behind it...")
        k = 201
        print("inside KNN, this takes up to 15 minutes to show the output")
        train = sys.argv[1]
        test = sys.argv[2]
        # (train, test, technique) = ("train-data-mod.txt", "test-data-mod.txt", "nearest")
        '''KNN is simple, but complexity is the problem: if the number of samples
        is huge, the algorithm becomes very expensive.'''
        result = knn.classify(train, test, k)
        confusionMatrix(result)
        print("knn_output.txt is created. You can see our predictions there")
    if technique == "nnet":
        input_data, class_labels, names = read_data(sys.argv[1])
        cd = {'0': 0, '90': 1, '180': 2, '270': 3}
        weights_one, weights_two = nnets.train(input_data, class_labels,
                                               int(sys.argv[-1]))
        output_data, oclass_labels, test_names = read_data(sys.argv[2])
        pred = feed_forward(input_data, weights_one, weights_two)
        print("Train Accuracy is", accuracy(pred, class_labels))
        test_pred = feed_forward(output_data, weights_one, weights_two)

        print("Test Accuracy is", accuracy(test_pred, oclass_labels))
        cf_list = write_to_file(test_pred, oclass_labels, test_names)
        confusionMatrix(cf_list)
        print("nnet_output.txt is created. You can see our predictions there")
        # print(cf_list[:10])
    # print(time.time() - start)
    if technique == "adaboost":
        print("hii... this would take around 1 minute to run")
        adaboost.main(sys.argv[1], sys.argv[2], int(sys.argv[4]))
        print("adaboost_output.txt is created. You can see our predictions there")
Example #30
    def test_action_recognition(self):
        actual = knn.classify((0, 155), self.training_set)
        self.assertEqual("action", actual)
Example #31
    def test_romance_recognition(self):
        actual = knn.classify((180, 8), self.training_set)
        self.assertEqual("romance", actual)
Example #32
def recog(vec, ts):
    return knn.classify(vec, ts.mat, ts.labels, 3)
Example #33
def multiPack(mix_args):
    return classify(mix_args[0], mix_args[1], mix_args[2], mix_args[3]) == mix_args[4]
Example #34
from knn import classify
from utils import load
import numpy as np
from tqdm import tqdm
import datetime

train_images, train_labels, test_images, test_labels = load()
# shrink the training set
train_images, train_labels = train_images[:10000], train_labels[:10000]
# keep only the first 100 test images to speed things up!
test_images, test_labels = test_images[:100], test_labels[:100]
k = 5
print("data loaded")
test_size = test_images.shape[0]
print("classifying {} samples".format(test_size))
timeA = datetime.datetime.now()
outY = [
    classify(test_images[i], train_images, train_labels, k)
    for i in tqdm(range(test_size))
]
print(datetime.datetime.now() - timeA)

# print("kNN output: {}".format(outY))
# print("true labels: {}".format(test_labels))
acc = np.mean(outY == test_labels)
print('\n\n')
print("accuracy: {}".format(acc))
# accuracy: 0.9683
Example #35
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import knn
from numpy import *

dataset, labels = knn.creatDataSet()
test_point = array([1.1, 0.3])  # renamed so it no longer shadows the input() builtin
K = 3
output = knn.classify(test_point, dataset, labels, K)
print("test data:", test_point, "classified as:", output)
Example #36
#!/usr/bin/python
# -*- coding: utf-8 -*-

from numpy import *
import importData
import plot
import knn

dataSet_tra, label_tra = importData.dataFrmFile('./optdigits.tra')
dataSet_tes, label_tes = importData.dataFrmFile('./optdigits.tes')
index = -1
cnt = 0
for i in label_tes:
    index += 1
    if i != knn.classify(dataSet_tes[index], dataSet_tra, label_tra, 10):
        cnt += 1

print(cnt)
plot.plot(dataSet_tes, label_tes, 4, 5)

Example #37
import knn
from numpy import *

dataset, labels = knn.createDataset()
print(dataset)
print(labels)
predict = array([[1, 1.1]])
result = knn.classify(predict, dataset, labels, 3)
print(result)