def run_knn(train_in, train_targ, valid_in, valid_targ, test_in, test_targ):
    '''Run k-NN, where k ranges over K_VALUES, and plot performance.'''
    train_rates = []
    valid_rates = []
    test_rates = []
    for k in K_VALUES:
        print "Running {}-NN.".format(k)
        train_prediction = knn(train_in, train_targ, train_in, k)
        valid_prediction = knn(train_in, train_targ, valid_in, k)
        test_prediction = knn(train_in, train_targ, test_in, k)
        train_rates += [classification_rate(train_prediction, train_targ)]
        valid_rates += [classification_rate(valid_prediction, valid_targ)]
        test_rates += [classification_rate(test_prediction, test_targ)]
        # Index with -1 (the rate just appended); indexing with k-1 is
        # only correct when K_VALUES is exactly 1..N.
        print "TRAINING CLASSIFICATION RATE = {}".format(train_rates[-1])
        print "VALIDATION CLASSIFICATION RATE = {}".format(valid_rates[-1])
        print "TEST CLASSIFICATION RATE = {}".format(test_rates[-1])

    best_k = K_VALUES[np.argmax(valid_rates)]
    print "MOST ACCURATE MODEL : {}-NN".format(best_k)
    test_prediction = knn(train_in, train_targ, test_in, best_k)
    test_classification = classification_rate(test_prediction, test_targ)
    print "TEST CLASSIFICATION RATE = {}".format(test_classification)

    plt.title("Classification of Actors using k-NN")
    plt.xlabel("k")
    plt.axis([np.min(K_VALUES), np.max(K_VALUES), 0, 100])
    plt.grid(True)
    plt.ylabel("Training Classification Rate")
    plt.plot(K_VALUES, train_rates, 'go')
    plt.show()

    plt.title("Classification of Actors using k-NN")
    plt.xlabel("k")
    plt.axis([np.min(K_VALUES), np.max(K_VALUES), 0, 100])
    plt.grid(True)  # was missing here; the other two plots enable it
    plt.ylabel("Validation Classification Rate")
    plt.plot(K_VALUES, valid_rates, 'ro')
    plt.show()

    plt.title("Classification of Actors using k-NN")
    plt.xlabel("k")
    plt.axis([np.min(K_VALUES), np.max(K_VALUES), 0, 100])
    plt.grid(True)
    plt.ylabel("Test Classification Rate")
    plt.plot(K_VALUES, test_rates, 'bo')
    plt.show()

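# Not part of the snippet above: a minimal sketch of the knn() and
# classification_rate() helpers run_knn() assumes, with signatures inferred
# from the calls above. The real implementations may differ.
import numpy as np
from collections import Counter

def knn(train_in, train_targ, query_in, k):
    """Predict a label for each row of query_in by majority vote
    among its k nearest training rows (Euclidean distance)."""
    train_in = np.asarray(train_in, dtype=float)
    predictions = []
    for q in np.asarray(query_in, dtype=float):
        dists = np.linalg.norm(train_in - q, axis=1)
        nearest = np.argsort(dists)[:k]
        votes = Counter(train_targ[i] for i in nearest)
        predictions.append(votes.most_common(1)[0][0])
    return np.array(predictions)

def classification_rate(prediction, target):
    """Percentage of predictions matching targets (0-100, to match
    the 0-100 y-axis used in the plots above)."""
    return 100.0 * np.mean(np.asarray(prediction) == np.asarray(target))
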
def uxval(start, stop, data, rows, f, z, m, k, a):
    rmax = len(rows)
    test = []
    temp = ""
    makeTable(colname[z], "train")
    for r in range(0, rmax):
        d = rows[r]
        if start <= r < stop:
            test.append(d)
        else:
            addRow(d, "train")
    #zeror(test, data, hypotheses, z)
    #xvalTest1(test, data, hypotheses)
    #nb(test, data, hypotheses, z, k, m)
    knn(test, data, "train", a, k)

def findKNN(Q: list, tree: QuadTree, K: int, datasetDict: dict):
    res = []
    for qi in Q:
        # find the K nearest neighbours of each query point
        resi = knn(datasetDict[qi], K, tree, datasetDict)
        res.append(resi)
    return res

def fit(self):
    # Prior probabilities P(H) per label, with Laplace smoothing s.
    for i in range(self.labels_num):
        y = 0
        for j in range(self.train_data_num):
            if self.train_target[j][i] == 1:
                y = y + 1
        self.Ph1[i] = (self.s + y) / (self.s * 2 + self.train_data_num)
    self.Ph0 = 1 - self.Ph1
    # Likelihoods P(E | H): for each label, count how many of a training
    # point's k nearest neighbours carry that label.
    for i in range(self.labels_num):
        c1 = np.zeros((self.k + 1,))
        c0 = np.zeros((self.k + 1,))
        for j in range(self.train_data_num):
            temp = 0
            KNN = knn(self.train_data, j, self.k)
            for k in range(self.k):
                if self.train_target[int(KNN[k])][i] == 1:
                    temp = temp + 1
            if self.train_target[j][i] == 1:
                c1[temp] = c1[temp] + 1
            else:
                c0[temp] = c0[temp] + 1
        for l in range(self.k + 1):
            self.Peh1[i][l] = (self.s + c1[l]) / (self.s * (self.k + 1) + c1.sum())
            self.Peh0[i][l] = (self.s + c0[l]) / (self.s * (self.k + 1) + c0.sum())

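# Not in the original: a hedged sketch of the matching ML-kNN prediction
# step. It assumes a hypothetical knn_query(train_data, x, k) helper that
# returns the indices of the k training rows nearest to an arbitrary
# vector x (the knn() used in fit() takes a training index instead).
def predict(self, test_data):
    pred = np.zeros((len(test_data), self.labels_num))
    for j, x in enumerate(test_data):
        neighbours = knn_query(self.train_data, x, self.k)
        for i in range(self.labels_num):
            # e = number of neighbours carrying label i
            e = sum(int(self.train_target[int(n)][i] == 1) for n in neighbours)
            # MAP rule: compare P(H1) * P(e | H1) against P(H0) * P(e | H0)
            if self.Ph1[i] * self.Peh1[i][e] > self.Ph0[i] * self.Peh0[i][e]:
                pred[j][i] = 1
    return pred
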
def main():
    testing = transform(pd.read_csv('Bankruptcy/testing_zScore.csv', header=None))
    training = transform(pd.read_csv('Bankruptcy/training_zScore.csv', header=None))
    between = find_between_var(testing)
    within = find_within_var(testing)
    w = np.real(projector(between, within))
    for data in testing:
        data[1] = np.dot(np.array(data[1]), w)
    for data in training:
        data[1] = np.dot(np.array(data[1]), w)
    # evaluate k-NN for several odd k
    for k in (1, 3, 5, 7):
        knn(k, testing, training)

def main():
    # K-means
    kmeans_machine = kmeans(sample, k)
    kmeans_machine.train()
    # KNN
    knn_machine = knn(test, sample, target, k)
    print knn_machine.train()
    # Decision tree
    # SVM
    svm_run(Dataset, Trainset)

def run(self):
    self.read_input()
    algo = self.options['algo']
    if algo == 'knn':
        classifier = knn(self.doc)
    elif algo == 'dtree':
        classifier = dtree(self.doc)
    elif algo == 'bnb':
        classifier = bnb(self.doc)
    elif algo == 'gnb':
        classifier = gnb(self.doc)
    elif algo == 'mnb':
        classifier = mnb(self.doc)
    else:
        # guard against an unknown algo leaving classifier undefined
        raise ValueError("unknown algorithm: {}".format(algo))
    classifier.evaluate()

def crossValidation(data, n):
    '''Leave-one-out cross-validation: hold out each example once, train
    on the rest, and accumulate accuracy plus a confusion matrix.'''
    ac = 0
    confusion = {}
    for a in range(len(data)):
        test = data.pop(0)
        model = knn(data, n)
        result = model.classify(test['Data'])
        if result == test['Type']:
            ac += 1
        if test['Type'] not in confusion.keys():
            confusion[test['Type']] = {}
        if result not in confusion[test['Type']].keys():
            confusion[test['Type']][result] = 0
        confusion[test['Type']][result] += 1
        data.append(test)
    ac /= len(data)
    print("Accuracy: " + str(ac * 100) + "%")
    for i in confusion.keys():
        print(i, end='')
        print(confusion[i])

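# Not in the original: a minimal sketch of the model interface that
# crossValidation() assumes -- a knn(data, n) constructor and a classify()
# method -- with rows shaped {'Data': [...], 'Type': label}.
import math
from collections import Counter

class knn:
    def __init__(self, data, n):
        self.data = data
        self.n = n

    def classify(self, features):
        # majority vote among the n nearest rows by Euclidean distance
        nearest = sorted(self.data,
                         key=lambda row: math.dist(row['Data'], features))[:self.n]
        votes = Counter(row['Type'] for row in nearest)
        return votes.most_common(1)[0][0]
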
from scipy.io import loadmat
from naive_bayes import *
from knn import *
from linear_regression import *

accuracy = lambda pred, actual: sum(pred.ravel() == actual.ravel()) / len(pred)

XTrain = loadmat('./data/Iris/XTrainIris.mat')['XTrain']
yTrain = loadmat('./data/Iris/yTrainIris.mat')['yTrain']
XTest = loadmat('./data/Iris/XTestIris.mat')['XTest']
yTest = loadmat('./data/Iris/yTestIris.mat')['yTest']

yTestPred = knn(XTrain, yTrain, XTest)
print('knn:', accuracy(yTestPred, yTest))
yTestPred = naiveBayesClassify(XTrain, yTrain, XTest)
print('NB:', accuracy(yTestPred, yTest))

XTrain = loadmat('./data/WBC/XTrainWBC.mat')['XTrain']
yTrain = loadmat('./data/WBC/yTrainWBC.mat')['yTrain']
XTest = loadmat('./data/WBC/XTestWBC.mat')['XTest']
yTest = loadmat('./data/WBC/yTestWBC.mat')['yTest']

yTestPred = knn(XTrain, yTrain, XTest)
print('knn:', accuracy(yTestPred, yTest))
yTestPred = naiveBayesClassify(XTrain, yTrain, XTest)
print('NB:', accuracy(yTestPred, yTest))

XTrain = loadmat('./data/Challenge/XTrain.mat')['XTrain']

def _nb1():
    with settings(LIB, seed=2), settings(TABLE, era=3):
        knn("weather.csv")

data = pd.read_csv("Movies_training_classification.csv") X_train, Y_train, x_test, y_test = Classi_PreProcessing(data) ch = int(input("Please choose a classifer 1-Logestic regression 2-KNN 3-SVM : ")) if ch == 1: try: load(x_test,y_test) except (OSError, IOError) as e: #model = logistic_regression(X_train,Y_train,x_test,y_test) model = logistic_regression(X_train,Y_train) load(x_test,y_test) elif ch == 2: try: load(x_test,y_test) except (OSError, IOError) as e: model = knn(X_train, Y_train) load(x_test,y_test) else: try: load(x_test,y_test) except (OSError, IOError) as e: model = SVM(X_train, Y_train) load(x_test,y_test) if choice == 2: data = pd.read_csv("Movies_training.csv")
                    help='test_num for knn')

if __name__ == '__main__':
    args = parser.parse_args()
    if args.exp_name == 'knn':
        args.split = 'both'

    # load data (only MNIST is wired up; both branches fall back to it)
    if args.dataset == 'mnist':
        data = Mnist(args)
    else:
        data = Mnist(args)

    # perform experiment
    if args.exp_name == 'perceptron':
        X, y = extract_1_5(data.X, data.y)
        # normalize to [-1, 1]
        lower_bound = -1 * np.ones(X.shape)
        X = lower_bound + 2 * (X / (X.max() - X.min()))
        perceptron(X, y, args)
    elif args.exp_name == 'knn':
        acc = [0 for _ in range(args.k_range)]
        # perform knn and "k" search
        for i in range(args.k_range):
            acc[i] = knn(data, args)
            args.k = args.k + 1
        draw_search(acc, args)

visualizeIdx = [tokens[word] for word in visualizeWords]
visualizeVecs = wordVectors[visualizeIdx, :]
temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
U, S, V = np.linalg.svd(covariance)
coord = temp.dot(U[:, 0:2])

for i in range(len(visualizeWords)):
    plt.text(coord[i, 0], coord[i, 1], visualizeWords[i],
             bbox=dict(facecolor='green', alpha=0.1))

plt.xlim((np.min(coord[:, 0]), np.max(coord[:, 0])))
plt.ylim((np.min(coord[:, 1]), np.max(coord[:, 1])))
plt.savefig('word_vectors.png')

test_words = [
    "rain", "snow", "cool", "bad", "worth",
    "coffee", "tea", "man", "queen", "hail"
]
inputVectors = wordVectors[:nWords]
inv_tokens = {v: k for k, v in tokens.items()}
for test_word in test_words:
    wordVector = inputVectors[tokens[test_word]]
    pred_idx = knn(wordVector, inputVectors, 10)
    print("Words similar to \"" + test_word + "\": ",
          [inv_tokens[i] for i in pred_idx])

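# Not in the original: a plausible sketch of the knn() helper these
# word-vector snippets call. Cosine similarity is the usual metric in this
# setting; the actual implementation may differ.
import numpy as np

def knn(vector, matrix, k):
    """Return indices of the k rows of `matrix` most similar to
    `vector` under cosine similarity."""
    sims = matrix.dot(vector) / (
        np.linalg.norm(matrix, axis=1) * np.linalg.norm(vector))
    return np.argsort(-sims)[:k]
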
from knn import *
from sklearn.model_selection import train_test_split

df = load_data_set('data/PhishingData.arff')
trainSet, testSet = train_test_split(df, test_size=0.2, random_state=42, shuffle=False)
trainSet = trainSet.values.tolist()
testSet = testSet.values.tolist()

for k in range(2, 33):
    knn_clf = knn(k)
    knn_clf.fit(trainSet, trainSet)
    hyp = knn_clf.predict(testSet)
    score = get_accuracy(testSet, hyp)
    print("k=", k, 'Score=', score)

# import the class, not just the module, so knn(...) is callable
from knn import knn

model = knn('datingTestSet')
model.train()
x = [1, 1, 1]
k = 3
res = model.test(x, k)

from knn import *

LOOPS_COUNT = 10
CLUSTERS_COUNT = 16

im = Image.open('lenna.png')
without_alpha = lambda x: x[:-1]  # drop the alpha channel, keep RGB
arr = list(map(without_alpha, im.getdata()))

centers, colors = knn(arr, LOOPS_COUNT, CLUSTERS_COUNT)
res_coords = fill_with_centers(arr, centers, colors)
print(res_coords.shape)
print(np.array(colors).shape)

res_img = Image.fromarray(res_coords)
res_img.show()
res_img.save('result_png.png')

def _knn1():
    knn('data/diabetes.arff', 'data/diabetes.arff').report()

from knn import *
from sampling import *
import pandas as pd

datasetwl = pd.read_csv("./Datasets/dataset_data_mining_course.csv", header=None)
datasetwl = np.array(datasetwl)
dataset = datasetwl[:, 0:2]
labels = datasetwl[:, 2]
seeds = [[5.0, 5.0], [-10.0, -10.0]]
seedlabels = [0, 1]

# distribution of the raw samples
show_clusters_with_label(dataset, labels)

# simple random sampling
random_samples, random_sampleslabels = simple_random_sampling(dataset, labels)
show_clusters_with_label(random_samples, random_sampleslabels)
knn(10, dataset, random_samples, random_sampleslabels, labels=labels)

# new sampling method (the sampling step is slow)
samples, sampleslabels = new_sampling(dataset, seeds, labels, seedlabels)
show_clusters_with_label(samples, sampleslabels)
knn(5, dataset, samples, sampleslabels, labels=labels)

def test_8020(T, k, normalized=True, debug=False, dataset_title='', num_tests=1):
    fig, ax = plt.subplots()
    # title uses the k parameter; it was hardcoded as '5-NN'
    ax.set(xlabel='Test Run #', ylabel='Error Rate (%)',
           title=F'{k}-NN Performance for ' + dataset_title)
    ax.grid()

    if normalized:
        T = normalize(T)

    UW_error_rates = []
    W_error_rates = []

    # make the run look sick
    pbar = ProgressBar()
    for simulation_x in pbar(range(num_tests)):
        # randomize testing and training set
        (T, sample_size) = stratify(T)

        # say sample_size on first go
        if simulation_x == 0:
            print(F'Testing {sample_size} examples')
            print(F'Running {num_tests} simulations')

        # slice so the test set holds exactly sample_size examples;
        # the original sliced at sample_size + 1 while still dividing
        # errors by sample_size
        test_set = T[:sample_size]
        training_set = T[sample_size:]

        unweighted_errors = 0
        weighted_errors = 0
        for x in test_set:
            # separate input and class for readability
            x_input = x[:-1]
            actual_class = x[-1]

            if debug:
                print('-' * 20)

            result_unweighted = knn(training_set, x_input, k, weighted=False, debug=debug)
            result_weighted = knn(training_set, x_input, k, weighted=True, debug=debug)

            if result_unweighted != actual_class:
                if debug:
                    print(F'KNN unweighted classified X as {result_unweighted} '
                          F'when it should be {actual_class}')
                unweighted_errors += 1
            if result_weighted != actual_class:
                if debug:
                    print(F'KNN weighted classified X as {result_weighted} '
                          F'when it should be {actual_class}')
                weighted_errors += 1

            if debug:
                print('-' * 20)

        UW_error_rates.append(100 * unweighted_errors / sample_size)
        W_error_rates.append(100 * weighted_errors / sample_size)

    average_UW_error_rate = sum(UW_error_rates) / num_tests
    average_W_error_rate = sum(W_error_rates) / num_tests
    print(F'Average error rate for unweighted knn is {average_UW_error_rate:.6f}')
    print(F'Average error rate for weighted knn is {average_W_error_rate:.6f}')
    # add some extra space between runs
    print('\n')

    # plot unweighted and weighted error rates
    ax.plot([x + 1 for x in range(num_tests)], UW_error_rates,
            label='unweighted error rates', alpha=0.5)
    ax.plot([x + 1 for x in range(num_tests)], W_error_rates,
            label='weighted error rates', alpha=0.7)
    ax.legend()
    plt.show()

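# Not in the original: a hedged sketch of the knn() signature test_8020()
# calls. Assumes each row is [f1, ..., fn, class]; weighted voting scales
# each vote by inverse squared distance. The real helper may differ.
import math
from collections import defaultdict

def knn(training_set, x_input, k, weighted=False, debug=False):
    # sort training rows by Euclidean distance to the query point
    by_dist = sorted(((math.dist(row[:-1], x_input), row[-1])
                      for row in training_set), key=lambda t: t[0])
    votes = defaultdict(float)
    for d, cls in by_dist[:k]:
        votes[cls] += 1.0 / (d * d + 1e-9) if weighted else 1.0
    # return the class with the largest (possibly weighted) vote total
    return max(votes, key=votes.get)
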
print 'Data Train: ', len(train_idx)
print 'Data Test: ', len(test_idx)
i = i + 1

X_train = dataset.data[train_idx]
X_test = dataset.data[test_idx]
y_train = dataset.target[train_idx]
y_test = dataset.target[test_idx]

row = []
row_f = []

# classifier
if classifier[0]:
    e_knn, f_knn = knn(k, dataset.data[train_idx], dataset.data[test_idx],
                       dataset.target[train_idx], dataset.target[test_idx])
    err_knn = err_knn + [e_knn]
    row = row + [e_knn]
    f1_knn = f1_knn + [f_knn]
    row_f = row_f + [f_knn]
    print 'knn finished'
if classifier[1]:
    e_wknn, f_wknn = wknn(k, dataset.data[train_idx], dataset.data[test_idx],
                          dataset.target[train_idx], dataset.target[test_idx])
    err_wknn = err_wknn + [e_wknn]
    row = row + [e_wknn]

visualizeWords = [
    "great", "cool", "brilliant", "wonderful", "well", "amazing", "worth",
    "sweet", "enjoyable", "boring", "bad", "dumb", "annoying", "female",
    "male", "queen", "king", "man", "woman", "rain", "snow", "hail",
    "coffee", "tea"
]
visualizeIdx = [tokens[word] for word in visualizeWords]
visualizeVecs = wordVectors[visualizeIdx, :]
temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
U, S, V = np.linalg.svd(covariance)
coord = temp.dot(U[:, 0:2])

for i in range(len(visualizeWords)):
    plt.text(coord[i, 0], coord[i, 1], visualizeWords[i],
             bbox=dict(facecolor='green', alpha=0.1))

plt.xlim((np.min(coord[:, 0]), np.max(coord[:, 0])))
plt.ylim((np.min(coord[:, 1]), np.max(coord[:, 1])))
plt.savefig('word_vectors.png')

knn_matrix = visualizeVecs
# query with the vector for visualizeWords[0]; the original used
# wordVectors[0], which need not correspond to that word
knn_vector = visualizeVecs[0]
near_idx = knn(knn_vector, knn_matrix, 5)
nearest = [visualizeWords[i] for i in near_idx]
print("the nearest words to", visualizeWords[0], "are", nearest)

def generateQuery(K: int, tree: QuadTree, datasetDict: dict) -> list:
    begin = random.randint(0, tree.root.pointsNum - 1)
    # take the K points nearest to a random start point (the start point
    # included) as the query set
    res = knn(datasetDict[begin], K, tree, datasetDict)
    return res

def _knn():
    knn('data/weather.arff', 'data/weather.arff').report()

"enjoyable", "boring", "bad", "waste", "dumb", "annoying" ] visualizeIdx = [tokens[word] for word in visualizeWords] visualizeVecs = wordVectors[visualizeIdx, :] temp = (visualizeVecs - np.mean(visualizeVecs, axis=0)) covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp) U, S, V = np.linalg.svd(covariance) coord = temp.dot(U[:, 0:2]) for i in xrange(len(visualizeWords)): plt.text(coord[i, 0], coord[i, 1], visualizeWords[i], bbox=dict(facecolor='green', alpha=0.1)) plt.xlim((np.min(coord[:, 0]), np.max(coord[:, 0]))) plt.ylim((np.min(coord[:, 1]), np.max(coord[:, 1]))) plt.savefig('q3_word_vectors.png') key_words = ["the", "unique", "superb", "comedy", "surprisingly"] inputVectors = wordVectors[:nWords] inv_tokens = {v: k for k, v in tokens.iteritems()} for key_word in key_words: wordVector = inputVectors[tokens[key_word]] idx = knn(wordVector, inputVectors, 11) print "Words related to \"" + key_word + "\": ", [ inv_tokens[i] for i in idx ]
image_data, labels, identity = loadMatFile('labeled_images.mat')
# # randomly select data from the data set as train and validation sets
# data_train, data_valid, label_train, label_valid = train_test_split(image_data, labels, test_size=0.33, random_state=42)

###################################################################
#                         identity k fold                         #
###################################################################
corr = []
c = {}
lkf = LabelKFold(identity, n_folds=3)
for k in kValue:
    for train, valid in lkf:
        train_data, valid_data, train_label, valid_label = generateDataByIndex(
            train, valid, image_data, labels)
        corr.append(knn(k, train_data, train_label, valid_data, valid_label))
    c[k] = sum(corr) / 3
    corr = []
print "identity-k-fold with knn:", c

# n_fold = 3
plot_title = 'identity-k-fold with knn (n_fold = 3)'
# c = {8: 52.478632478632484, 2: 51.28205128205128, 4: 53.53846153846154, 10: 52.78632478632479, 6: 53.641025641025635}
plotCorrectness(c, plot_title)

algo = {}
for train, valid in lkf:
    train_data, valid_data, train_label, valid_label = generateDataByIndex(
        train, valid, image_data, labels)
    name, correctRate = OneVsOne(train_data, valid_data, train_label, valid_label)
    addResult(algo, name, correctRate)