def k_nearest_neighbour_classify(self):
    # Classify the held-out split with kNN and show the accuracy in the GUI.
    knn = KNNClassifier()
    predictions = knn.predict_classification(self.X_train, self.y_train, self.X_test)
    # Format the accuracy to two decimal places instead of slicing the string.
    accuracy = "%.2f" % self.main.accuracy(self.y_test, predictions)
    self.knn_acc_label.setText(accuracy)
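# The method above relies on an accuracy helper exposed as self.main.accuracy.
# A minimal sketch of such a helper (hypothetical, not taken from this
# codebase): the fraction of predictions that match the true labels.
import numpy as np

def accuracy(y_true, y_pred):
    # Elementwise comparison; the mean of the boolean matches is the accuracy.
    return float(np.mean(np.asarray(y_true) == np.asarray(y_pred)))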
def test_knn(self):
    from classifiers import KNNClassifier

    # Render k as a single number or a dash-joined list for the report.
    if isinstance(self.k, int):
        k = "%s" % self.k
    else:
        k = "-".join([str(i) for i in self.k])

    print("KNNClassifier")
    print("---" * 45)
    print("Train num = %s" % self.train_num)
    print("Test num = %s" % self.test_num)
    print("K = %s" % k)

    knn = KNNClassifier(self.train_data, self.train_labels, k=self.k,
                        best_words=self.best_words)

    classify_labels = []
    print("KNNClassifier is testing ...")
    for data in self.test_data:
        classify_labels.append(knn.classify(data))
    print("KNNClassifier tests are over.")

    filepath = "f_runout/KNN-%s-train-%d-test-%d-f-%d-k-%s-%s.xls" % \
               (self.type, self.train_num, self.test_num, self.feature_num, k,
                datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S"))
    self.write(filepath, classify_labels)
def match_codes():
    """Match the full dataset in parallel with a customizable embedding method."""
    # Baseline accuracies: 0.398657046863 hard, 0.275397642306 soft
    working_directory = './data/'
    data_codes, data_descriptions = get_data_to_match('slim')
    official_codes, official_descriptions = get_official_data()

    level = 1
    # Alternatives tried here: word2vecEmbedder(), or HashingEmbedder with
    # analyzer='char' and ngram_range=(3, 5) or (2, 3).
    model = HashingEmbedder(level=level, analyzer='char_wb',
                            ngram_range=(4, 5), norm='l2')
    model.embed_data(data_descriptions)
    print('loaded and embedded data')

    # test_nNN(model, data_descriptions, data_codes)

    use_section = False
    if use_section:
        official_code_labels = get_section_codes(model.official_codes)
        true_data_codes = get_section_codes(data_codes)
    else:
        official_code_labels = coarsen_codes(model.official_codes)
        true_data_codes = coarsen_codes(data_codes)

    nNN = 4
    classifier = KNNClassifier(n_neighbors=nNN)
    classifier.fit(model.official_embeddings, official_code_labels)
    pred_codes = classifier.predict(model.data_embeddings, pbar=True)
    # Alternative: classifier.predict_with_edit_dist(model.data_embeddings,
    #     data_descriptions, model.official_descriptions)
    errors = pred_codes - true_data_codes
    print('Correctly predicted', 1.0 * np.sum(errors == 0) / errors.shape[0],
          'of top level codes')
    # plot_confusion_matrix(true_data_codes, pred_codes)

    # Repeat the comparison with word2vec embeddings.
    model = word2vecEmbedder()
    model.embed_data(data_descriptions)
    classifier = KNNClassifier(n_neighbors=nNN)
    classifier.fit(model.official_embeddings, official_code_labels)
    pred_codes = classifier.predict(model.data_embeddings)
def classify(data, classifier, num_classes, train_labels, train_features,
             test_labels, test_features):
    """
    Function used by FoldRunner to execute classification based on the current classifier

    :param data: the configuration dictionary
    :param classifier: current classifier (from the classifiers list in the configuration file)
    :param num_classes: the number of distinct labels (binary or multiclass classification)
    :param train_labels: the labels of all train instances
    :param train_features: the features of all train instances
    :param test_labels: the labels of all test instances
    :param test_features: the features of all test instances
    :return: the confusion matrix of the classification
    """
    if classifier == "NN_keras":
        return nnk.classify(data, num_classes, train_labels, train_features,
                            test_labels, test_features)
    elif classifier == "NN_scikit-learn":
        return nns.classify(train_labels, train_features, test_labels, test_features)
    elif classifier == "KNN":
        return knn.classify(data, train_labels, train_features, test_labels, test_features)
    elif classifier == "NaiveBayes":
        return nb.classify(train_labels, train_features, test_labels, test_features)
    elif classifier == "RandomForest":
        return rf.classify(train_labels, train_features, test_labels, test_features)
    elif classifier == "LogisticRegression":
        return lr.classify(train_labels, train_features, test_labels, test_features)
    else:
        # Fail loudly instead of silently returning None for unknown names.
        raise ValueError("Unknown classifier: %s" % classifier)
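# A sketch of how a fold runner might drive the dispatcher above. The loop
# structure and the "classifiers" config key are assumptions for illustration,
# not the project's actual FoldRunner.
def run_folds(data, folds, num_classes):
    confusion_matrices = {}
    for name in data["classifiers"]:  # assumed config key
        for train_labels, train_features, test_labels, test_features in folds:
            cm = classify(data, name, num_classes, train_labels, train_features,
                          test_labels, test_features)
            confusion_matrices.setdefault(name, []).append(cm)
    return confusion_matrices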
def classify_custom_input(self, custom_input_vector):
    # Run the custom input through each trained model and show its prediction.
    nb = NaiveBayesClassifier()
    nb.train(self.X_train, self.y_train)
    prediction = nb.predict([custom_input_vector])
    self.custom_text_nb_label.setText(str(prediction[0]))

    knn = KNNClassifier()
    prediction = knn.predict_classification(self.X_train, self.y_train,
                                            [custom_input_vector])
    self.custom_text_knn_label.setText(str(prediction[0]))

    # Each result now goes to its matching label (the random-forest and
    # decision-tree labels were previously swapped).
    rf = SklearnRandomForest()
    prediction = rf.random_forest(self.X_train, self.y_train, [custom_input_vector])
    self.custom_text_rf_label.setText(str(prediction[0]))

    dt = SklearnDecisionTree()
    prediction = dt.decision_tree(self.X_train, self.y_train, [custom_input_vector])
    self.custom_text_dt_label.setText(str(prediction[0]))
def test_nNN(model, data_descriptions, data_codes, nNNmin=2, nNNmax=10):
    # Sweep the number of neighbours and compare edit-distance kNN against
    # plain Euclidean kNN on section-level codes.
    # Alternative granularity: use coarsen_codes(...) instead of
    # get_section_codes(...) to evaluate on coarse top-level codes.
    true_section_codes = get_section_codes(data_codes)
    for nNN in range(nNNmin, nNNmax + 1):
        classifier = KNNClassifier(n_neighbors=nNN)

        t1 = time()
        classifier.fit(model.official_embeddings,
                       get_section_codes(model.official_codes))
        pred_section_codes = classifier.predict_with_edit_dist(
            model.data_embeddings, data_descriptions, model.official_descriptions)
        errors = pred_section_codes - true_section_codes
        print('------------------------------')
        print('nNN:', nNN)
        print('Correctly predicted', 1.0 * np.sum(errors == 0) / errors.shape[0],
              'of top level codes w/ edit dist kNN')
        print('Took', time() - t1, 'seconds')

        t1 = time()
        pred_section_codes = classifier.predict(model.data_embeddings)
        errors = pred_section_codes - true_section_codes
        print('Correctly predicted', 1.0 * np.sum(errors == 0) / errors.shape[0],
              'of top level codes w/ euclidean kNN')
        print('Took', time() - t1, 'seconds')
        print('------------------------------')
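# Both branches above repeat the same fraction-correct computation; a small
# helper like this (hypothetical, not in the source) would keep the sweep
# readable.
import numpy as np

def fraction_correct(pred_codes, true_codes):
    # Fraction of exact matches between predicted and true codes.
    pred_codes = np.asarray(pred_codes)
    return float(np.sum(pred_codes == np.asarray(true_codes))) / pred_codes.shape[0]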
def test_knn(self):
    from classifiers import KNNClassifier

    # Render k as a single number or a dash-joined list for the report.
    if isinstance(self.k, int):
        k = "%s" % self.k
    else:
        k = "-".join([str(i) for i in self.k])

    print("KNNClassifier")
    print("---" * 45)
    print("Train num = %s" % self.train_num)
    print("Test num = %s" % self.test_num)
    print("K = %s" % k)

    # Debug output on the training set.
    print(self.train_labels)
    print(len(self.train_data))
    print(self.train_data[0])

    knn = KNNClassifier(self.train_data, self.train_labels, k=self.k,
                        best_words=self.best_words)

    classify_labels = []
    print("KNNClassifier is testing ...")
    for data in self.test_data:
        classify_labels.append(knn.classify(data))
    print("KNNClassifier tests are over.")

    filepath = "f_runout/KNN-%s-train-%d-test-%d-f-%d-k-%s-%s.xls" % \
               (self.type, self.train_num, self.test_num, self.feature_num, k,
                datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S"))
    self.write(filepath, classify_labels)
def KNNClassifier(self, dataset=None, class_column=None, name=None, pipeline=None,
                  K=5, kernel="euclidean", algo="auto", weights="uniform",
                  kernel_params=None):
    # Avoid a mutable default argument for kernel_params.
    if kernel_params is None:
        kernel_params = {}
    return KNNClassifier(dataset=dataset, class_column=class_column, name=name,
                         pipeline=pipeline, K=K, kernel=kernel, algo=algo,
                         weights=weights, kernel_params=kernel_params,
                         client=self.client)
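# Example call, assuming `api` is an already-constructed instance of the
# client class that exposes this factory method; the dataset path and column
# name below are illustrative, not from the source.
model = api.KNNClassifier(dataset="data/iris.csv", class_column="species",
                          K=7, kernel="manhattan", weights="distance")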
    localAreaDensity=-1,  # Using numActiveColumnsPerInhArea
    numActiveColumnsPerInhArea=64,
    # All input activity can contribute to feature output
    stimulusThreshold=0,
    synPermInactiveDec=synPermDec,
    synPermActiveInc=synPermInc,
    synPermConnected=synPermConn,
    maxBoost=1.0,
    seed=1956,  # The seed that Grok uses
    spVerbosity=1)

# Instantiate the spatial pooler test bench.
tb = VisionTestBench(sp)

# Instantiate the classifier.
clf = KNNClassifier()

# Train the spatial pooler on trainingVectors.
numCycles = tb.train(trainingVectors, trainingTags, clf, maxTrainingCycles,
                     minAccuracy)

# Save the permanences and connections after training.
# tb.savePermanences('perms.jpg')
# tb.showPermanences()
# tb.showConnections()

# Get testing images and convert them to vectors.
testingImages, testingTags = data.getImagesAndTags(testingDataset)
testingVectors = encoder.imagesToVectors(testingImages)

# Reverse the order of the vectors and tags for testing
if len(sys.argv) < 2:
    terminate()
else:
    mode = sys.argv[1]
    if mode not in func_mode_list:
        terminate()


def show_plot_sample():
    # Plot a 5x5 grid of random test digits with true and predicted labels.
    fig = plt.figure(figsize=(8, 8))
    fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
    for i in tqdm(range(25)):
        idx = random.randint(0, len(testX) - 1)  # avoid shadowing builtin id
        image = np.reshape(testX[idx], [28, 28])
        ax = fig.add_subplot(5, 5, i + 1, xticks=[], yticks=[])
        ax.imshow(image, cmap=plt.cm.binary, interpolation='nearest')
        ax.text(0, 2, "label:" + str(testY[idx]))
        ax.text(0, 4, "predict:" + str(knn.predict(testX[idx])))
    plt.show()


if __name__ == '__main__':
    trainX, trainY, testX, testY = load_mnist()
    knn = KNNClassifier(train_data=trainX, train_labels=trainY, ord=2)
    if mode == 'run_sample':
        show_plot_sample()
    else:
        knn.test_acc(test_data=testX, test_label=testY, K=1)
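# For reference, a self-contained sketch of the kind of KNNClassifier the
# demo above constructs (train_data, train_labels, ord). This is an
# illustrative re-implementation under those assumptions, not the project's
# actual class.
import numpy as np

class SimpleKNN:
    def __init__(self, train_data, train_labels, ord=2):
        self.train_data = np.asarray(train_data, dtype=float)
        self.train_labels = np.asarray(train_labels)
        self.ord = ord  # Minkowski order: 2 is Euclidean, 1 is Manhattan

    def predict(self, x, K=1):
        # Distance from x to every training vector under the chosen norm.
        dists = np.linalg.norm(self.train_data - np.asarray(x, dtype=float),
                               ord=self.ord, axis=1)
        # Majority vote among the K nearest neighbours.
        nearest = self.train_labels[np.argsort(dists)[:K]]
        values, counts = np.unique(nearest, return_counts=True)
        return values[np.argmax(counts)]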