Example #1
import numpy as np
import matplotlib.pyplot as plt

def run_knn(train_in, train_targ, valid_in, valid_targ, test_in, test_targ):
    '''
    Run k-NN for each k in K_VALUES and plot the classification performance.
    '''
    train_rates = []
    valid_rates = []
    test_rates = []
    for k in K_VALUES:
        print "Running {}-NN.".format(k)
        train_prediction = knn(train_in, train_targ, train_in, k)
        valid_prediction = knn(train_in, train_targ, valid_in, k)
        test_prediction = knn(train_in, train_targ, test_in, k)

        train_rates.append(classification_rate(train_prediction, train_targ))
        valid_rates.append(classification_rate(valid_prediction, valid_targ))
        test_rates.append(classification_rate(test_prediction, test_targ))

        print("TRAINING CLASSIFICATION RATE = {}".format(train_rates[-1]))
        print("VALIDATION CLASSIFICATION RATE = {}".format(valid_rates[-1]))
        print("TEST CLASSIFICATION RATE = {}".format(test_rates[-1]))

    best_k = K_VALUES[np.argmax(valid_rates)]
    print "MOST ACCURATE MODEL : {}-NN".format(best_k)
    test_prediction = knn(train_in, train_targ, test_in, best_k)
    test_classification = classification_rate(test_prediction, test_targ)
    print "TEST CLASSIFICATION RATE = {}".format(test_classification)
    
    plt.title("Classification of Actors using k-NN")
    plt.xlabel("k")
    plt.axis([np.min(K_VALUES), np.max(K_VALUES), 0, 100])
    plt.grid(True)
    plt.ylabel("Training Classification Rate")
    plt.plot(K_VALUES, train_rates, 'go')
    plt.show()

    plt.title("Classification of Actors using k-NN")
    plt.xlabel("k")
    plt.axis([np.min(K_VALUES), np.max(K_VALUES), 0, 100])
    plt.grid(True)
    plt.ylabel("Validation Classification Rate")
    plt.plot(K_VALUES, valid_rates, 'ro')
    plt.show()

    plt.title("Classification of Actors using k-NN")
    plt.xlabel("k")
    plt.axis([np.min(K_VALUES), np.max(K_VALUES), 0, 100])
    plt.grid(True)
    plt.ylabel("Test Classification Rate")
    plt.plot(K_VALUES, test_rates, 'bo')
    plt.show()
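The helpers this example leans on are not shown in the snippet. A minimal sketch of what they might look like (hypothetical stand-ins, assuming a Euclidean-distance majority vote and rates reported as percentages to match the 0-100 plot axes):

import numpy as np

K_VALUES = list(range(1, 11))  # assumed: the list of k values swept above

def knn(train_in, train_targ, query_in, k):
    # classify each query row by majority vote among its k nearest
    # training rows (Euclidean distance); hypothetical stand-in
    preds = []
    for q in query_in:
        dists = np.linalg.norm(train_in - q, axis=1)
        nearest = np.argsort(dists)[:k]
        values, counts = np.unique(train_targ[nearest], return_counts=True)
        preds.append(values[np.argmax(counts)])
    return np.array(preds)

def classification_rate(prediction, target):
    # percentage of predictions matching the target labels
    return 100.0 * np.mean(prediction == target)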
Example #2
def uxval(start, stop, data, rows, f, z, m, k, a):
    """One cross-validation fold: rows with index in [start, stop) form
    the test set; the remaining rows are loaded into the "train" table,
    then k-NN is run against it."""
    rmax = len(rows)
    test = []
    temp = ""
    makeTable(colname[z], "train")
    for r in range(0, rmax):
        d = rows[r]
        if start <= r < stop:
            test.append(d)
        else:
            addRow(d, "train")
    #zeror(test, data, hypotheses, z)
    #xvalTest1(test,data,hypotheses)
    #nb(test,data,hypotheses,z,k,m)
    knn(test, data, "train", a, k)
Example #3
def findKNN(Q: list, tree: QuadTree, K: int, datasetDict: dict):
    res = []
    for qi in Q:
        # find the K nearest points for each query
        resi = knn(datasetDict[qi], K, tree, datasetDict)
        res.append(resi)
    return res
Example #4
def fit(self):
    # label priors P(H1) for each label, with Laplace smoothing s
    for i in range(self.labels_num):
        y = 0
        for j in range(self.train_data_num):
            if self.train_target[j][i] == 1:
                y = y + 1
        self.Ph1[i] = (self.s + y)/(self.s*2 + self.train_data_num)
    self.Ph0 = 1 - self.Ph1

    # posterior counts: c1[t] (resp. c0[t]) counts training instances
    # with (resp. without) label i whose k nearest neighbors include
    # exactly t instances carrying label i
    for i in range(self.labels_num):
        c1 = np.zeros((self.k + 1,))
        c0 = np.zeros((self.k + 1,))
        for j in range(self.train_data_num):
            temp = 0
            KNN = knn(self.train_data, j, self.k)
            for n in range(self.k):
                if self.train_target[int(KNN[n])][i] == 1:
                    temp = temp + 1
            if self.train_target[j][i] == 1:
                c1[temp] = c1[temp] + 1
            else:
                c0[temp] = c0[temp] + 1

        for l in range(self.k + 1):
            self.Peh1[i][l] = (self.s + c1[l])/(self.s*(self.k + 1) + c1.sum())
            self.Peh0[i][l] = (self.s + c0[l])/(self.s*(self.k + 1) + c0.sum())
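The fit above is the training half of an ML-kNN style classifier: Ph1/Ph0 are smoothed label priors and Peh1/Peh0 are smoothed posteriors over neighbor counts. A hedged sketch of the matching prediction step (my reconstruction, not the original; knn_query is a hypothetical helper returning the k nearest training indices for an unseen point):

def predict(self, test_data):
    pred = np.zeros((len(test_data), self.labels_num))
    for t in range(len(test_data)):
        KNN = knn_query(self.train_data, test_data[t], self.k)  # hypothetical helper
        for i in range(self.labels_num):
            # c = how many of the k neighbors carry label i
            c = 0
            for n in range(self.k):
                if self.train_target[int(KNN[n])][i] == 1:
                    c = c + 1
            # assign label i when P(H1)P(Ec|H1) >= P(H0)P(Ec|H0)
            if self.Ph1[i] * self.Peh1[i][c] >= self.Ph0[i] * self.Peh0[i][c]:
                pred[t][i] = 1
    return pred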
Example #5
def main():
    testing = transform(pd.read_csv('Bankruptcy/testing_zScore.csv', header=None))
    training = transform(pd.read_csv('Bankruptcy/training_zScore.csv', header=None))
    between = find_between_var(testing)
    within = find_within_var(testing)
    w = np.real(projector(between, within))

    for data in testing:
        data[1] = np.dot(np.array(data[1]), w)
    for data in training:
        data[1] = np.dot(np.array(data[1]), w)

    knn(1, testing, training)
    knn(3, testing, training)
    knn(5, testing, training)
    knn(7, testing, training)
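The projector helper is not shown; for a Fisher-style discriminant it would typically return the eigenvectors of inv(S_within) @ S_between as projection directions (note the np.real() wrapper above, which strips any imaginary residue). A minimal sketch under that assumption, hypothetical rather than the original implementation:

import numpy as np

def projector(between, within):
    # eigenvectors of inv(S_w) S_b, ordered by descending eigenvalue
    eigvals, eigvecs = np.linalg.eig(np.linalg.inv(within).dot(between))
    order = np.argsort(-eigvals.real)
    return eigvecs[:, order]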
Example #6
def main():

    #Kmeans
    kmeans_machine = kmeans(sample, k)
    kmeans_machine.train()

    #KNN
    knn_machine = knn(test, sample, target, k)
    print(knn_machine.train())

    #Decision tree

    #SVM
    svm_run(Dataset, Trainset)
Example #7
def run(self):
    self.read_input()
    algo = self.options['algo']
    if algo == 'knn':
        classifier = knn(self.doc)
    elif algo == 'dtree':
        classifier = dtree(self.doc)
    elif algo == 'bnb':
        classifier = bnb(self.doc)
    elif algo == 'gnb':
        classifier = gnb(self.doc)
    elif algo == 'mnb':
        classifier = mnb(self.doc)
    else:
        raise ValueError("unknown algo: {}".format(algo))
    classifier.evaluate()
Example #8
def crossValidation(data, n):
    """Leave-one-out cross-validation: each record takes one turn as the
    test example while the remaining records train the k-NN model."""
    ac = 0
    confusion = {}
    for a in range(len(data)):
        test = data.pop(0)
        model = knn(data, n)
        result = model.classify(test['Data'])
        if result == test['Type']:
            ac += 1
        if test['Type'] not in confusion:
            confusion[test['Type']] = {}
        if result not in confusion[test['Type']]:
            confusion[test['Type']][result] = 0
        confusion[test['Type']][result] += 1
        data.append(test)
    ac /= len(data)
    print("Accuracy: " + str(ac*100) + "%")
    for i in confusion:
        print(i, end='')
        print(confusion[i])
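A hypothetical call, assuming each record is a dict with 'Data' (feature vector) and 'Type' (class label) keys, which is the layout the loop above expects:

records = [
    {'Data': [5.1, 3.5, 1.4, 0.2], 'Type': 'setosa'},
    {'Data': [6.7, 3.0, 5.2, 2.3], 'Type': 'virginica'},
]
crossValidation(records, 3)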
Example #9
from scipy.io import loadmat

from naive_bayes import *
from knn import *
from linear_regression import *

accuracy = lambda pred, actual: sum(pred.ravel() == actual.ravel()) / len(pred)

XTrain = loadmat('./data/Iris/XTrainIris.mat')['XTrain']
yTrain = loadmat('./data/Iris/yTrainIris.mat')['yTrain']
XTest = loadmat('./data/Iris/XTestIris.mat')['XTest']
yTest = loadmat('./data/Iris/yTestIris.mat')['yTest']

yTestPred = knn(XTrain, yTrain, XTest)
print('knn:', accuracy(yTestPred, yTest))

yTestPred = naiveBayesClassify(XTrain, yTrain, XTest)
print('NB:', accuracy(yTestPred, yTest))

XTrain = loadmat('./data/WBC/XTrainWBC.mat')['XTrain']
yTrain = loadmat('./data/WBC/yTrainWBC.mat')['yTrain']
XTest = loadmat('./data/WBC/XTestWBC.mat')['XTest']
yTest = loadmat('./data/WBC/yTestWBC.mat')['yTest']

yTestPred = knn(XTrain, yTrain, XTest)
print('knn:', accuracy(yTestPred, yTest))

yTestPred = naiveBayesClassify(XTrain, yTrain, XTest)
print('NB:', accuracy(yTestPred, yTest))

XTrain = loadmat('./data/Challenge/XTrain.mat')['XTrain']
Example #10
def _nb1():
    with settings(LIB, seed=2), settings(TABLE, era=3):
        knn("weather.csv")
    data = pd.read_csv("Movies_training_classification.csv")
    X_train, Y_train, x_test, y_test = Classi_PreProcessing(data)
    ch = int(input("Please choose a classifier: 1-Logistic regression 2-KNN 3-SVM : "))

    if ch == 1:
        try:
            load(x_test, y_test)
        except (OSError, IOError):
            #model = logistic_regression(X_train,Y_train,x_test,y_test)
            model = logistic_regression(X_train, Y_train)
            load(x_test, y_test)
    elif ch == 2:
        try:
            load(x_test, y_test)
        except (OSError, IOError):
            model = knn(X_train, Y_train)
            load(x_test, y_test)
    else:
        try:
            load(x_test, y_test)
        except (OSError, IOError):
            model = SVM(X_train, Y_train)
            load(x_test, y_test)

if choice == 2:
    
    data = pd.read_csv("Movies_training.csv")
Example #12
                    help='test_num for knn')

if __name__ == '__main__':
    args = parser.parse_args()
    if args.exp_name == 'knn':
        args.split = 'both'

    # load data
    if args.dataset == 'mnist':
        data = Mnist(args)
    else:
        data = Mnist(args)

    # perform experiment
    if args.exp_name == 'perceptron':
        X, y = extract_1_5(data.X, data.y)

        # normalize to [-1, 1]
        X = 2 * (X - X.min()) / (X.max() - X.min()) - 1

        perceptron(X, y, args)

    elif args.exp_name == 'knn':
        acc = [0 for _ in range(args.k_range)]
        # perform knn and "k" search
        for i in range(args.k_range):
            acc[i] = knn(data, args)
            args.k = args.k + 1
        draw_search(acc, args)
Example #13
visualizeIdx = [tokens[word] for word in visualizeWords]
visualizeVecs = wordVectors[visualizeIdx, :]
temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
U, S, V = np.linalg.svd(covariance)
coord = temp.dot(U[:, 0:2])

for i in range(len(visualizeWords)):
    plt.text(coord[i, 0],
             coord[i, 1],
             visualizeWords[i],
             bbox=dict(facecolor='green', alpha=0.1))

plt.xlim((np.min(coord[:, 0]), np.max(coord[:, 0])))
plt.ylim((np.min(coord[:, 1]), np.max(coord[:, 1])))

plt.savefig('word_vectors.png')

test_words = [
    "rain", "snow", "cool", "bad", "worth", "coffee", "tea", "man", "queen",
    "hail"
]

inputVectors = wordVectors[:nWords]
inv_tokens = {v: k for k, v in tokens.items()}
for test_word in test_words:
    wordVector = inputVectors[tokens[test_word]]
    pred_idx = knn(wordVector, inputVectors, 10)
    print("The similar word \"" + test_word + "\": ",
          [inv_tokens[i] for i in pred_idx])
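The knn used here takes one vector and returns the indices of the most similar rows. A plausible sketch, assuming cosine similarity as is usual for word vectors (an assumption, not the original code):

import numpy as np

def knn(vector, matrix, k):
    # indices of the k rows of matrix most cosine-similar to vector
    sims = matrix.dot(vector) / (
        np.linalg.norm(matrix, axis=1) * np.linalg.norm(vector) + 1e-12)
    return np.argsort(-sims)[:k]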
Example #14
from knn import *
from sklearn.model_selection import train_test_split

df = load_data_set('data/PhishingData.arff')
trainSet, testSet = train_test_split(df,
                                     test_size=0.2,
                                     random_state=42,
                                     shuffle=False)
trainSet = trainSet.values.tolist()
testSet = testSet.values.tolist()

for k in range(2, 33):
    knn_clf = knn(k)
    knn_clf.fit(trainSet, trainSet)
    hyp = knn_clf.predict(testSet)
    score = get_accuracy(testSet, hyp)
    print("k=", k, 'Score=', score)
Example #15
from knn import *

model = knn('datingTestSet')
model.train()
x = [1, 1, 1]
k = 3
res = model.test(x, k)


Example #16
from PIL import Image

from knn import *

LOOPS_COUNT = 10
CLUSTERS_COUNT = 16

im = Image.open('lenna.png')

without_alpha = lambda x: x[:-1]
arr = list(map(without_alpha, im.getdata()))

centers, colors = knn(arr, LOOPS_COUNT, CLUSTERS_COUNT)

res_coords = fill_with_centers(arr, centers, colors)

print(res_coords.shape)
print(np.array(colors).shape)


res_img = Image.fromarray(res_coords)
res_img.show()
res_img.save('result_png.png')
Example #17
def _knn1():
    knn('data/diabetes.arff', 'data/diabetes.arff').report()
Example #18
from knn import *
from sampling import *
import pandas as pd

datasetwl = pd.read_csv(r".\Datasets\dataset_data_mining_course.csv",
                        header=None)
datasetwl = np.array(datasetwl)
dataset = datasetwl[:, 0:2]
labels = datasetwl[:, 2]
seeds = [[5.0, 5.0], [-10.0, -10.0]]
seedlabels = [0, 1]
# the distribution of the data itself
show_clusters_with_label(dataset, labels)
# simple random sampling
random_samples, random_sampleslabels = simple_random_sampling(dataset, labels)
show_clusters_with_label(random_samples, random_sampleslabels)
knn(10, dataset, random_samples, random_sampleslabels, labels=labels)
# new sampling method (the sampling process is slow)
samples, sampleslabels = new_sampling(dataset, seeds, labels, seedlabels)
show_clusters_with_label(samples, sampleslabels)
knn(5, dataset, samples, sampleslabels, labels=labels)
Example #19
def test_8020(T,
              k,
              normalized=True,
              debug=False,
              dataset_title='',
              num_tests=1):
    fig, ax = plt.subplots()
    ax.set(xlabel='Test Run #',
           ylabel='Error Rate (%)',
           title=F'{k}-NN Performance for ' + dataset_title)
    ax.grid()

    if normalized:
        T = normalize(T)

    UW_error_rates = []
    W_error_rates = []

    # make the run look sick
    pbar = ProgressBar()

    for simulation_x in pbar(range(num_tests)):
        # randomize testing and training set
        (T, sample_size) = stratify(T)

        # say sample_size on first go
        if simulation_x == 0:
            print(F'Testing {sample_size} examples')
            print(F'Running {num_tests} simulations')

        test_set = T[:sample_size + 1]
        training_set = T[sample_size + 1:]

        unweighted_errors = 0
        weighted_errors = 0
        for x in test_set:
            # separate input and class for readability
            x_input = x[:-1]
            actual_class = x[-1]

            if debug:
                print('-' * 20)

            result_unweighted = knn(training_set,
                                    x_input,
                                    k,
                                    weighted=False,
                                    debug=debug)
            result_weighted = knn(training_set,
                                  x_input,
                                  k,
                                  weighted=True,
                                  debug=debug)

            if result_unweighted != actual_class:
                if debug:
                    print(
                        F'KNN unweighted classified X as {result_unweighted} when it should be {actual_class}'
                    )
                unweighted_errors += 1

            if result_weighted != actual_class:
                if debug:
                    print(
                        F'KNN weighted classified X as {result_weighted} when it should be {actual_class}'
                    )
                weighted_errors += 1

            if debug:
                print('-' * 20)

        UW_error_rates.append(100 * unweighted_errors / sample_size)
        W_error_rates.append(100 * weighted_errors / sample_size)

    average_UW_error_rate = sum(UW_error_rates) / num_tests
    average_W_error_rate = sum(W_error_rates) / num_tests

    print(
        F'Average error rate for unweighted knn is {average_UW_error_rate:.6f}'
    )
    print(F'Average error rate for weighted knn is {average_W_error_rate:.6f}')

    # add some extra space between runs
    print('\n')

    # plot unweighted and weighted error rates
    ax.plot([x + 1 for x in range(num_tests)],
            UW_error_rates,
            label='unweighted error rates',
            alpha=0.5)
    ax.plot([x + 1 for x in range(num_tests)],
            W_error_rates,
            label='weighted error rates',
            alpha=0.7)
    ax.legend()
    plt.show()
Example #20
            print('Data Train: ', len(train_idx))
            print('Data Test: ', len(test_idx))

            i = i + 1
            X_train = dataset.data[train_idx]
            X_test = dataset.data[test_idx]
            y_train = dataset.target[train_idx]
            y_test = dataset.target[test_idx]

            row = []
            row_f = []

            # classifier
            if classifier[0]:
                e_knn, f_knn = knn(k, dataset.data[train_idx],
                                   dataset.data[test_idx],
                                   dataset.target[train_idx],
                                   dataset.target[test_idx])
                err_knn = err_knn + [e_knn]
                row = row + [e_knn]

                f1_knn = f1_knn + [f_knn]
                row_f = row_f + [f_knn]
                print('knn finished')

            if classifier[1]:
                e_wknn, f_wknn = wknn(k, dataset.data[train_idx],
                                      dataset.data[test_idx],
                                      dataset.target[train_idx],
                                      dataset.target[test_idx])
                err_wknn = err_wknn + [e_wknn]
                row = row + [e_wknn]
Example #21
visualizeWords = [
    "great", "cool", "brilliant", "wonderful", "well", "amazing", "worth",
    "sweet", "enjoyable", "boring", "bad", "dumb", "annoying", "female",
    "male", "queen", "king", "man", "woman", "rain", "snow", "hail", "coffee",
    "tea"
]

visualizeIdx = [tokens[word] for word in visualizeWords]
visualizeVecs = wordVectors[visualizeIdx, :]

temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
U, S, V = np.linalg.svd(covariance)
coord = temp.dot(U[:, 0:2])

for i in range(len(visualizeWords)):
    plt.text(coord[i, 0],
             coord[i, 1],
             visualizeWords[i],
             bbox=dict(facecolor='green', alpha=0.1))

plt.xlim((np.min(coord[:, 0]), np.max(coord[:, 0])))
plt.ylim((np.min(coord[:, 1]), np.max(coord[:, 1])))

plt.savefig('word_vectors.png')

knn_matrix = visualizeVecs
knn_vector = wordVectors[0]
near_idx = knn(knn_vector, knn_matrix, 5)
nearest = [visualizeWords[i] for i in near_idx]
print("the nearest words to", visualizeWords[0], "are", nearest)
Example #22
def generateQuery(K: int, tree: QuadTree, datasetDict: dict) -> list:
    begin = random.randint(0, tree.root.pointsNum - 1)
    # find the other nearest points and use them, plus the start point, as the query set
    res = knn(datasetDict[begin], K, tree, datasetDict)
    return res
Example #23
def _nb1():
    with settings(LIB, seed=2), settings(TABLE, era=3):
        knn("weather.csv")
Example #24
def _knn():
    knn('data/weather.arff', 'data/weather.arff').report()
    "enjoyable", "boring", "bad", "waste", "dumb", "annoying"
]

visualizeIdx = [tokens[word] for word in visualizeWords]
visualizeVecs = wordVectors[visualizeIdx, :]
temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
U, S, V = np.linalg.svd(covariance)
coord = temp.dot(U[:, 0:2])

for i in range(len(visualizeWords)):
    plt.text(coord[i, 0],
             coord[i, 1],
             visualizeWords[i],
             bbox=dict(facecolor='green', alpha=0.1))

plt.xlim((np.min(coord[:, 0]), np.max(coord[:, 0])))
plt.ylim((np.min(coord[:, 1]), np.max(coord[:, 1])))

plt.savefig('q3_word_vectors.png')

key_words = ["the", "unique", "superb", "comedy", "surprisingly"]
inputVectors = wordVectors[:nWords]
inv_tokens = {v: k for k, v in tokens.items()}
for key_word in key_words:
    wordVector = inputVectors[tokens[key_word]]
    idx = knn(wordVector, inputVectors, 11)
    print "Words related to \"" + key_word + "\": ", [
        inv_tokens[i] for i in idx
    ]
Example #26
image_data, labels, identity = loadMatFile('labeled_images.mat')

# # randomly select data from the data set as train and validation sets
# data_train, data_valid, label_train, label_valid = train_test_split(image_data, labels, test_size=0.33, random_state=42)

###################################################################
# identity k fold #
###################################################################
corr = []
c = {}
lkf = LabelKFold(identity, n_folds=3)
for k in kValue:
    for train, valid in lkf:
        train_data, valid_data, train_label, valid_label = generateDataByIndex(train, valid, image_data, labels)

        corr.append(knn(k, train_data, train_label, valid_data, valid_label))
    c[k] = sum(corr) / 3
    corr = []
print("identity-k-fold with knn:", c)

# n_fold = 3
plot_title = 'identity-k-fold with knn (n_fold = 3)'
# c = {8: 52.478632478632484, 2: 51.28205128205128, 4: 53.53846153846154, 10: 52.78632478632479, 6: 53.641025641025635}
plotCorrectness(c, plot_title)

algo = {}
for train, valid in lkf:
    train_data, valid_data, train_label, valid_label = generateDataByIndex(train, valid, image_data, labels)

    name, correctRate = OneVsOne(train_data, valid_data, train_label, valid_label)
    addResult(algo, name, correctRate)