def mind_kl(data, dist_table, K, dim, dist_type):
    """Estimate intrinsic dimension by minimising a KL divergence.

    For each candidate dimension d in 1..dim, a random subset of d feature
    columns is drawn, a KNN distance table is built on that subspace, and the
    KL divergence between the full-space neighbour ratio (pr) and the
    subspace ratio (pdr) is evaluated; the d with the smallest divergence
    is returned.

    Args:
        data: (N, dim) feature matrix.
        dist_table: precomputed full-space distance table; released as soon
            as pr has been derived from it.
        K: neighbourhood size.
        dim: number of feature columns in data.
        dist_type: distance metric name forwarded to knn.KNN.

    Returns:
        Estimated dimension (int in 1..dim).
    """
    N = dist_table.shape[0]
    # Neighbour ratio in the full space.
    print('compute pr')
    pr = get_pr(dist_table, K)
    print('pr=%.2f' % np.min(pr))
    del dist_table  # free the (large) table as early as possible

    # Neighbour ratio in each candidate subspace.
    print('compute pdr')
    kld = sys.maxsize
    d = 1
    for i in range(dim):
        sample_id = np.random.choice(dim, i + 1, replace=False)
        samples = data[:, sample_id]
        nb = knn.KNN(K, dist_type=dist_type, data=samples)
        dtable = nb.get_dist_multip(False)
        pdr = get_pr(dtable, K)
        cur_kld = math.log(N / (N - 1)) + np.mean(np.log(np.divide(pr, pdr)))
        if cur_kld < kld:
            kld = cur_kld
            d = i + 1
        # Fixed: the format string used '\%', an invalid escape sequence that
        # printed a stray backslash; a "[current/total]" counter was intended.
        print('[%d/%d]: kld=%.2f' % (i, dim, kld))
    return d
def main():
    """Propagate POS projections over a KNN graph of vertices.

    Seed vertices keep their initial projection; every other vertex is
    iteratively re-estimated as a smoothed, weighted average of its
    neighbours' current projections.
    """
    vertices = {}
    print("Read vertices from file")
    for line in open(args.vertices_file):
        vertex = graph_f.Vertex(line)
        vertices[vertex.name] = vertex
    print("Number of Vertices: {}".format(len(vertices)))

    print("Loading KNN graph")
    graph_loader = knn.KNN(vertices, sys.maxsize, args.knn_graph_file)
    knn_graph = graph_loader.GetMatrix(args.knn_distance_threshold)

    print("Loading projections")
    initial_vertex_projections, all_pos = LoadProjections(args.projections, vertices)
    uniform_pos = {pos: 1 / len(all_pos) for pos in all_pos}

    current_projections = initial_vertex_projections
    for iteration in range(args.num_iterations):
        print("Iteration:", iteration + 1)
        new_projections = {}
        for vertex in vertices.values():
            # Seed vertices keep their given projection untouched.
            if vertex in initial_vertex_projections:
                new_projections[vertex] = initial_vertex_projections[vertex]
                continue
            # Weighted average of neighbour projections, smoothed toward
            # the uniform distribution with weight args.nu.
            nominator = MulScalarByVector(args.nu, uniform_pos)
            denominator = args.nu
            for neighbour, dist in knn_graph[vertex]:
                weight = 1 - dist
                neighbour_pos = current_projections.get(neighbour, uniform_pos)
                nominator = AddVector(nominator, MulScalarByVector(weight, neighbour_pos))
                denominator += weight
            new_projections[vertex] = MulScalarByVector(1 / denominator, nominator)
        current_projections = new_projections
def example_knn():
    """Walk through knn.KNN usage: train, classify, visualize, pickle, reload.

    Fixed: this function used Python 2 print statements while the rest of
    the codebase uses Python 3 print(); converted preserving the exact
    output (trailing-comma prints become end=" ").
    """
    print("*" * 60)
    print("*" * 16, "An Example of knn's Usage", "*" * 17)
    print("*" * 60)
    data1 = [(3, 5), (2, 3), (5, 4), (9, 6), (4, 7), (8, 1), (7, 2), (8, 8)]
    data = []
    for i in data1:
        data.append({0: i[0], 1: i[1]})
    label = [1, 1, 1, 0, 1, 0, 1, 0]
    m = knn.KNN(data, label, dimensions=2)
    print("Samples:", m.train_data)
    print("\nLabel prb:", m.class_prb)
    # print(m.decision())
    print("\n\nvisualize the kd-tree: ")
    m.visualize_kdtree()
    f = ds.EuclideanDistance
    print("the label of point", {0: 9, 1: 9}, "is", end=" ")
    print(m.classify(point={0: 9, 1: 9}, k=3, dist=f, prbout=1))
    print("the label of point", {0: 2, 1: 8}, "is", end=" ")
    print(m.classify(point={0: 2, 1: 8}, k=3, dist=f, prbout=1))
    knn.saveknn(m, 'testknn.pkl')  # Pickle test
    print("*" * 60)
    print("Load knn model from file: 'testknn.pkl'")
    n = knn.loadknn('testknn.pkl')
    print("Samples:", n.train_data)
    print("\nLabel prb:", n.class_prb)
    # print(n.decision())
    print("\n\nvisualize the kd-tree: ")
    n.visualize_kdtree()
def GradeTest():
    """Hold-out evaluation of the student-grade KNN classifier.

    Normalises the feature columns of studentsgrade.csv, uses the first 30%
    of rows as queries against the remaining rows, and prints the overall
    error rate.
    """
    rate = 0.30  # fraction of rows used as test queries

    # Read the CSV; last column is the grade, the rest are features.
    numdf = pd.read_csv('studentsgrade.csv').values
    row_num, col_num = numdf.shape
    grade = numdf[:, col_num - 1]
    features = numdf[:, :col_num - 1]

    # Number of test queries; error counter.
    test_data = int(row_num * rate)
    error_count = 0.0

    # Min-max normalise; the tail rows form the reference (training) set.
    norm_features, therange, mincols = nm.Norm(features)
    df_test = norm_features[test_data:row_num, :]
    label = grade[test_data:row_num]

    for i in range(test_data):
        classified_result = knn.KNN(df_test, label, norm_features[i, :], 3)
        print('The classifiier returned {}. The real answer is: {}'.format(
            classified_result, grade[i]))
        if (classified_result != grade[i]):
            error_count += 1.0

    print('Total Error rate is: {}. '.format(error_count / float(test_data)))
def integrate(train_data, train_label, test_data, test_label):
    """Majority-vote ensemble of SVM, KNN and a BP neural network.

    Each base model predicts every test sample; the per-sample class with
    the most votes wins. Prints progress and the final accuracy.
    """
    # Support vector machine
    svm_clf = svm.SVM(train_data, train_label, test_data, test_label)
    test_result_svm = svm_clf.classify()
    print("SVM over.")

    # k-nearest neighbours (k = 7)
    knn_clf = knn.KNN(train_data, train_label, 7)
    test_result_kn = knn_clf.work(test_data)
    print("KNN over.")

    # Back-propagation neural network (400-25-10 topology)
    net = network.Network([400, 25, 10], train_data, train_label, test_data,
                          test_label)
    test_result_bp = net.SGD(30, 10, 3.0)
    print("BP over.")

    # One vote per model; argmax of the tally picks the ensemble class.
    num = len(test_data)
    err = 0
    for i in range(num):
        votes = np.zeros((10, 1))
        votes[test_result_svm[i]] += 1
        votes[test_result_kn[i]] += 1
        votes[test_result_bp[i]] += 1
        if np.argmax(votes) != test_label[i][0]:
            err += 1
    print('accuracy:', 1 - 1.0 * err / num)
def ClassifyGrade(inArr):
    """Classify a student's grade from a raw feature vector.

    Fixes two related defects: the min-max-normalised features and the
    normalised query were computed and then discarded (the classifier was
    queried with raw values against raw features, unlike GradeTest which
    classifies in normalised space), and the query normalisation used
    `[:, None]` column vectors, broadcasting the 1-D query into a
    meaningless 2-D array. The classifier now runs in the same normalised
    space as GradeTest().

    Args:
        inArr: sequence of raw feature values (same order as the CSV's
            feature columns).

    Returns:
        The grade label chosen by the 3-NN classifier.
    """
    # Read the CSV; last column is the grade, the rest are features.
    numdf = pd.read_csv('mlscripts/studentsgrade.csv').values
    col_num = numdf.shape[1]
    grade = numdf[:, col_num - 1]
    features = numdf[:, :col_num - 1]

    # Min-max normalise training features and the query point alike.
    # mincols/therange are per-feature 1-D arrays from nm.Norm — the 1-D
    # broadcast keeps the query a vector (the old [:, None] form did not).
    norm_features, therange, mincols = nm.Norm(features)
    inputs = (inArr - mincols) / therange

    return knn.KNN(norm_features, grade, inputs, 3)


#asn = ClassifyGrade([3,5,60, 9, 3])
#print(asn)
def cal_5fold(k, dist_type, train_data):
    """Run 5-fold cross validation for one KNN configuration.

    Appends a header plus averaged accuracy/precision/recall/F1 to
    result_02_5fold.txt. dist_type is 'e'/'m'/'l' (case-insensitive) for
    Euclidean / Manhattan / L-infinity.

    Fix: previously `t_data = train_data` aliased the caller's array and
    np.random.shuffle then mutated it in place; the data is now copied
    before shuffling.

    NOTE(review): fold boundaries are hard-coded to 160 rows, i.e. the code
    assumes train_data has 800 rows — confirm.
    """
    print("Saving...")
    clf_5 = knn.KNN(k, dist_type)

    with open("result_02_5fold.txt", "a") as f_5:
        f_5.write("============================================\n\n")
        f_5.write("# of K: %d\n" % k)
        if dist_type == 'e' or dist_type == 'E':
            f_5.write("Distance Type: Euclidean\n")
        elif dist_type == 'm' or dist_type == 'M':
            f_5.write("Distance Type: Manhattan\n")
        elif dist_type == 'l' or dist_type == 'L':
            f_5.write("Distance Type: L∞\n")

        c_arr = np.array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])
        acc_test_arr = np.array([])
        pre_test_arr = np.array([])
        re_test_arr = np.array([])
        f1_test_arr = np.array([])

        # Copy so the caller's array is not shuffled in place.
        t_data = train_data.copy()
        np.random.shuffle(t_data)

        for i in range(5):
            lo, hi = 160 * i, 160 * (i + 1)
            train5_data = np.delete(t_data, np.s_[lo:hi], axis=0)
            test5_data = t_data[lo:hi]
            clf_5.train(train5_data)
            # Column 0 holds the true class label.
            a_arr_test5 = test5_data[:, 0]
            p_arr_test5 = np.array([])
            for row in test5_data:
                p_arr_test5 = np.append(p_arr_test5, clf_5.predict(row))
            acc_test_arr = np.append(
                acc_test_arr, cal_accuracy(a_arr_test5, p_arr_test5, c_arr))
            pre_test_arr = np.append(
                pre_test_arr, cal_precision(a_arr_test5, p_arr_test5, c_arr))
            re_test_arr = np.append(
                re_test_arr, cal_recall(a_arr_test5, p_arr_test5, c_arr))
            f1_test_arr = np.append(
                f1_test_arr, cal_f1(a_arr_test5, p_arr_test5, c_arr))

        f_5.write("\n5-fold Cross Validation Metrics\n")
        f_5.write("Accuracy: %.4lf\n" % np.average(acc_test_arr))
        f_5.write("Precision: %.4lf\n" % np.average(pre_test_arr))
        f_5.write("Recall: %.4lf\n" % np.average(re_test_arr))
        f_5.write("F-1 Score: %.4lf\n" % np.average(f1_test_arr))
        f_5.write("\n============================================\n\n")

    print("Done!")
    return
def main():
    """Build trigram vertices (or reload them), then build/load the KNN graph."""
    vertices = collections.defaultdict(Vertex)  # keyed by trigram tuple

    if args.f or not os.path.exists(args.vertices_file):
        corpus = Vertex()
        print("Loading tri-grams...")
        for line in open(args.corpus):
            for fivegram in LineToNgrams(line, 5):
                # The middle trigram of each 5-gram names the vertex.
                vertices[fivegram[1:-1]].Update(fivegram)
                corpus.Update(fivegram)
        print("Number of Vertices: {}".format(len(vertices)))

        print("Updating PMI...")
        for trigram, vertex in vertices.items():
            vertex.UpdatePMI(corpus)

        print("Normalizing features")
        Normalize(vertices, corpus)

        print("Write vertices to file")
        with open(args.vertices_file, "w") as out:
            for vertex in vertices.values():
                out.write(vertex.dumps())
    else:
        print("Read vertices from file")
        for line in open(args.vertices_file):
            vertex = Vertex(line)
            vertices[vertex.name] = vertex
        print("Number of Vertices: {}".format(len(vertices)))

    if args.f or not os.path.exists(args.graph_file):
        print("Building KNN graph")
        knn_graph_builder = knn.KNN(vertices, args.k)
        knn_matrix = knn_graph_builder.Run(args.graph_file)
    else:
        print("Loading KNN graph")
        knn_graph_builder = knn.KNN(vertices, args.k, args.graph_file)
def run_test(k, dist_f, kernel_f, x_train, y_train, name):
    """Evaluate one KNN configuration against the global test set.

    Fills a confusion matrix over x_test/y_test, prints it together with an
    F-score summary line, and plots the margin density.
    """
    confusion = np.zeros((n_classes, n_classes))
    margins = []
    model = knn.KNN(x_train, y_train, k=k, dist_f=dist_f, kernel_f=kernel_f)
    for idx, sample in x_test.iterrows():
        predicted, margin = model.run(sample, y_test[idx])
        confusion[predicted][y_test[idx]] += 1
        margins.append(margin)
    print(confusion)
    print("{} : k={}, dist={}, kernel={}, name={}".format(
        kernels.f_score(confusion)[0], k, dist_f.__name__,
        kernel_f.__name__, name))
    sns.kdeplot(margins)
def TestKNN(train, test, algoType, K):
    """Time one run of the requested KNN variant.

    algoType selects "normal", "manhattan" or "minkow"; any other value
    leaves accuracy at 0.0. Returns (accuracy, elapsed_seconds).
    """
    variants = {
        "normal": algo.KNN,
        "manhattan": algo.KNNManhattan,
        "minkow": algo.KNNMinkow,
    }
    accuracy = 0.0
    startTime = time.time()
    runner = variants.get(algoType)
    if runner is not None:
        accuracy = runner(train, test, K)
    return accuracy, time.time() - startTime
def setUp(self):
    """Create a 2-NN classifier over two well-separated point clusters."""
    bottom_left = [(0.5, 1.0), (1.5, 1.0), (1.5, 1.5)]
    top_right = [(9.5, 8.0), (8.5, 7.0), (10.5, 9.0)]
    samples = [["bottom_left", point] for point in bottom_left]
    samples += [["top_right", point] for point in top_right]
    # Query point sits near the bottom-left cluster.
    target = (2.0, 3.0)
    self.knn = my_knn.KNN(samples, target, 2)
def garbage_classifier(training_data_folder, test_data_folder, k):
    """Train a KNN on ORB image features and report test accuracy.

    Fix: the accuracy loop and denominator were hard-coded to 417 test
    samples; the actual number of test images is now used.

    Args:
        training_data_folder: folder of labelled training images.
        test_data_folder: folder of labelled test images.
        k: neighbourhood size for the KNN.

    Returns:
        The predicted labels for the test set.
    """
    training_labels = preprocessor.get_labels(training_data_folder)

    # ORB feature extraction (Hu-moment descriptors are the available
    # alternative via descriptor.Hu_descriptor).
    vectorized_training_data = descriptor.ORB_descriptor(training_data_folder)
    vectorized_test_data = descriptor.ORB_descriptor(test_data_folder)

    knn_obj = knn.KNN(k)
    knn_obj.train(vectorized_training_data, training_labels)

    # Cosine distance (Manhattan / Euclidean / Hamming predict_* variants
    # also exist on the KNN object).
    predicted_labels = knn_obj.predict_by_Cosine(vectorized_test_data)

    # Column 0: prediction, column 1: ground truth.
    display = np.hstack(
        (predicted_labels, preprocessor.get_labels(test_data_folder)))

    n_test = display.shape[0]  # was hard-coded to 417
    count = 0
    for num in range(n_test):
        if display[num, 0] == display[num, 1]:
            count = count + 1
    print("Accuracy of prediction:")
    print(count / n_test)
    return predicted_labels
def main():
    """Estimate intrinsic dimension for several K; append results to knn_dim.txt.

    Fix: `dist_table` was only bound in the else-branch, so running with
    --if_dist_table raised NameError in the loop below. The table is now
    always loaded after the optional compute step (get_dist_multip is
    assumed to write the table to args.dist_table_filename — TODO confirm).
    """
    epsilon = 0.01
    max_iter = 200
    args = config.parse_args()

    # Load the feature matrix from .npy / .mat / .npz.
    if args.data_filename.endswith('npy'):
        data = np.load(args.data_filename)
    if args.data_filename.endswith('mat'):
        data = scipy.io.loadmat(args.data_filename)
        data = data['feat']
    if args.data_filename.endswith('npz'):
        data = np.load(args.data_filename)
        data = data['feat']

    # Optionally (re)compute and persist the sorted distance table, then load it.
    if args.if_dist_table:
        obj = knn.KNN(128, args.dist_table_filename, args.data_filename,
                      args.dist_type, args.if_norm)
        obj.get_dist_multip()
    dist_table = np.load(args.dist_table_filename)

    K_array = [4, 7, 9, 15, 21, 30, 70, 90, 128]
    for K in K_array:
        print('compute dimension')
        dim_est = Dimest(data, K, epsilon, max_iter, dist_table[:, 1, 0:K])
        i, d0, d2 = dim_est.get_dim()
        print('iteration {}: dim0 = {}; dim = {}'.format(i, d0, d2))
        respath = os.path.join(args.resfolder, 'knn_dim.txt')
        with open(respath, 'a') as f:
            f.write('K=%d: iteration=%d; dim0=%.2f, dim=%.2f;\n'
                    % (K, i, d0, d2))
def test(train_data, test_data, k, metric):
    """Score a KNN configuration on test_data.

    Each test instance carries (x, y, expected_class) in positions 0-2.
    Prints "k;accuracy" and returns (predicted_classes, accuracy).
    """
    knn_instance = knn.KNN(train_data, k, metric)
    predictions = []
    hits = 0
    misses = 0
    for instance in test_data:
        predicted = knn_instance.compute_class((instance[0], instance[1]))
        predictions.append(predicted)
        if instance[2] == predicted:
            hits += 1
        else:
            misses += 1
    accuracy = hits / (hits + misses)
    print("{};{}".format(k, accuracy))
    return predictions, accuracy
def main():
    """Estimate dimension via idea() for several K; append to idea_dim.txt.

    Fix: `dist_table` was only bound in the else-branch, so running with
    --if_dist_table raised NameError in the loop below. The table is now
    always loaded after the optional compute step (get_dist_multip is
    assumed to write the table to args.dist_table_filename — TODO confirm).
    """
    args = config.parse_args()

    # Load the feature matrix from .npy / .mat / .npz.
    if args.data_filename.endswith('npy'):
        data = np.load(args.data_filename)
    if args.data_filename.endswith('mat'):
        data = scipy.io.loadmat(args.data_filename)
        data = data['feat']
    if args.data_filename.endswith('npz'):
        data = np.load(args.data_filename)
        data = data['feat']
    nrof_image = data.shape[0]
    dim = data.shape[1]

    # Optionally (re)compute and persist the sorted distance table, then load it.
    if args.if_dist_table:
        obj = knn.KNN(128, args.dist_table_filename, args.data_filename,
                      args.dist_type, args.if_norm)
        obj.get_dist_multip()
    dist_table = np.load(args.dist_table_filename)

    # Compute dimension for each neighbourhood size.
    K_array = [4, 7, 9, 15, 21, 30, 70, 90, 128]
    for K in K_array:
        d = idea(dist_table, K)
        # Alternative estimator: mind_kl(data, dist_table, K, dim, 'Arclength')
        print('K={}: dim = {}'.format(K, d))
        respath = os.path.join(args.resfolder, 'idea_dim.txt')
        with open(respath, 'a') as f:
            f.write('K=%d: dim=%.2f;\n' % (K, d))
def q1():
    """Plot an ROC-style curve for the expression-data KNN at k=3.

    Sweeps a threshold fraction, collects (1 - specificity, sensitivity)
    pairs from calc_metrics, and scatter-plots them.
    """
    model = knn.KNN()
    model.load_data("GSE25628_filtered_expression.txt",
                    "GSE25628_samples.txt")
    k = 3
    fractions = [.05, .1, .25, .5, .75, .9, 1]
    xs = []
    ys = []
    for fn in fractions:
        sensitivity, specificity = model.calc_metrics(k, fn)
        xs.append(1 - specificity)
        ys.append(sensitivity)
    plt.scatter(xs, ys)
    plt.title("ROC curve")
    plt.xlabel("1 - Specificity")
    plt.ylabel("Sensitivity")
    plt.show()
import dumbClassifiers as du
import datasets as data
import runClassifier as run
import numpy
import knn

# Q9: learning curve for a 5-NN classifier on the digits data.
curve = run.learningCurveSet(knn.KNN({'isKNN': True, 'K': 5}), data.DigitData)
run.plotCurve('K-Nearest Neighbor on 5-NN; DIgitsData', curve)

# Q11: hyperparameter curve over K = 1..10.
curve = run.hyperparamCurveSet(knn.KNN({'isKNN': True}), 'K',
                               [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                               data.DigitData)
run.plotCurve('Hyperparameter Curve on DigitsData', curve)

# Q12: hyperparameter curve over eps for the non-KNN (epsilon-ball) variant,
# sweeping 1, 1.5, 2, ... up to 19.5.
arr = []
eps = 1
while eps < 20:
    arr.append(eps)
    eps += .5
curve = run.hyperparamCurveSet(knn.KNN({'isKNN': False}), 'eps', arr,
                               data.DigitData)
run.plotCurve('Hyperparameter Curve on DigitsData', curve)
def get_distTable(args):
    """Build and persist the KNN distance table for args.data_filename.

    Delegates entirely to knn.KNN (128 neighbours) configured from the
    parsed command-line arguments; get_dist_multip does the actual work.
    """
    knn.KNN(128, args.dist_table_filename, args.data_filename,
            args.dist_type, args.if_norm).get_dist_multip()
X[:, :, 2, :] = allZCoordinates - meanValue return X train3 = ReduceData(train3) train4 = ReduceData(train4) test3 = ReduceData(test3) test4 = ReduceData(test4) train3 = CenterData(train3) train4 = CenterData(train4) test3 = CenterData(test3) test4 = CenterData(test4) trainX, trainy = ReshapeData(train3, train4) testX, testy = ReshapeData(test3, test4) knn = knn.KNN() knn.Use_K_Of(15) knn.Fit(trainX, trainy) correctPredictions = 0 for row in range(0, 2000): actualClass = testy[row] prediction = knn.Predict(testX[row]) if (actualClass == prediction): correctPredictions = correctPredictions + 1 print(correctPredictions) print((correctPredictions / 2000) * 100)
# Resize the images for each animal category found under rootdir.
animals = ["cats", "dogs", "panda"]
for anm in animals:
    os.chdir(rootdir)
    chgdir = os.getcwd() + os.sep + anm
    os.chdir(chgdir)
    category_dir = os.getcwd()
    output_dir = category_dir + os.sep + "resized"
    resizeImage(category_dir, output_dir=output_dir)

# 80/20 train/test split, then hold out 12.5% of train for validation.
(trainX_a, testX, trainY_a, testY) = train_test_split(
    img_dataset, img_labels, test_size=.20, random_state=0)
(trainX, valX, trainY, valY) = train_test_split(
    trainX_a, trainY_a, test_size=.125, random_state=0)

max_k = 2
print("running test vs. validation")
# NOTE(review): this fits on the validation split and predicts the test
# split — confirm that is intended and not the reverse.
Ypred_val = []
for k_value in range(1, max_k + 1):
    # One KNN model per candidate k.
    knn_val = knn.KNN(k_value)
    knn_val.train(npy.asarray(valX), npy.asarray(valY))
    Ypred_val.append(knn_val.predict(npy.asarray(testX)))
print("ypred val", Ypred_val)
# Hold-out split point and neighbour count for the base classifier.
N_TRAIN = 175
K = 1

# Read data from file: column 0 is the label, the rest are features.
with open(FILE, "r") as data_csv:
    rows = list(csv.reader(data_csv))
random.shuffle(rows)
trainlabels = [float(row[0]) for row in rows]
trainset = [[float(e) for e in row[1:]] for row in rows]

classifier = knn.KNN(K)
classifier.train(trainset[:N_TRAIN], trainlabels[:N_TRAIN])


def evalClassifier(individual):
    """Fitness of a feature mask: (held-out accuracy, fraction of features used)."""
    labels = classifier.predict(trainset[N_TRAIN:], individual)
    held_out = trainlabels[N_TRAIN:]
    accuracy = sum(x == y for x, y in zip(labels, held_out)) / float(len(held_out))
    return accuracy, sum(individual) / float(classifier.ndim)


# Maximise accuracy while minimising the number of selected features.
creator.create("FitnessMulti", base.Fitness, weights=(1.0, -1.0))
creator.create("Individual", list, fitness=creator.FitnessMulti)

toolbox = base.Toolbox()
# Attribute generator: one random bit per feature.
toolbox.register("attr_bool", random.randint, 0, 1)
] # %% LEARN = False if LEARN: ks = [1, 2, 4, 7, 11, 16, 22, 29, 37, 46, 56, 79, 106, 121, 151, 199] for k in ks: for dist_f in dist_fs: for kernel_f in kernel_fs: result = np.zeros((n_classes, n_classes)) margins = [] for i, sample in x_train.iterrows(): model = knn.KNN(x_train.drop(i), y_train.drop(i), k=k, dist_f=dist_f, kernel_f=kernel_f) predict_y, margin = model.run(sample, y_train[i]) margins.append(margin) result[predict_y][y_train[i]] += 1 print("{:6} : k={:3}, dist={:8}, kernel={:8}".format( kernels.f_score(result)[0], k, dist_f.__name__, kernel_f.__name__)) ds = [0.01, 0.1, 0.5, 1.0, 2.0, 4.0, 6.0, 8.0, 12.0] for d in ds: for dist_f in dist_fs: for kernel_f in kernel_fs: result = np.zeros((n_classes, n_classes)) margins = []
(trainX, valX, trainY, valY) = train_test_split(trainX_a, trainY_a, test_size=.125, random_state=0) """ initialize class knn.KNN with k=1 Find the best k-value this is a for loop to go k=1:max_k """ Ypred_val = [] for i in range(1, max_k + 1): print("running validation vs. test for k =", i) #load class knn to find best value for k=i knn_val = knn.KNN(i) #load validation-set as training data into model knn_val.train(np.asarray(trainX), np.asarray(trainY)) #get the prediction Ypred_val.append(knn_val.predict(np.asarray(valX))) """ This will evaluate the different k values for l1 and l2 to determine the most accurate value for k http://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html """ best_k = np.zeros((max_k, 2), dtype=float) report_val_l1 = [] report_val_l2 = [] """
def keyPressEvent(self, event):
    """Keyboard dispatcher for the digit-drawing widget.

    L: load model   S: save model          T: fit classifier
    W: build X/y    E: show sample counts  Q: clear canvas
    Space: capture digits as training data   F: capture + recognise
    """
    k = event.key()
    if k == Qt.Key_L:
        # Load a previously trained model from disk.
        self.knn_clf = knn.KNN(None, None)
        self.knn_clf.clf = ModelUtils.load_model('./model/model1.m')
        print("加载模型成功")
    elif k == Qt.Key_S:
        # Persist the fitted estimator, if there is one.
        if self.knn_clf.clf == None:
            print("模型为None,不能保存")
            return
        ModelUtils.save_model(self.knn_clf.clf, './model/model1.m')
        print("已保存模型")
    elif k == Qt.Key_T:
        # Fit a fresh classifier on the collected data (no PCA).
        self.knn_clf = knn.KNN(self.Data.X, self.Data.y, is_pca = False)
        self.knn_clf.fitWithoutPca()
    elif k == Qt.Key_W:
        # Convert collected samples into X/y and report their shapes.
        self.Data.get_X_y()
        print(self.Data.X.shape)
        print(self.Data.y.shape)
    elif k == Qt.Key_E:
        # Show how many sub-images / targets have been collected.
        print(len(self.Data.sub_imgs))
        print(len(self.Data.target))
    elif k == Qt.Key_Q:
        # Clear the drawing canvas.
        self.clearScreen()
        self.update()
    elif k == Qt.Key_Space:
        # Grab the window contents and turn them into training samples —
        # only while no model exists yet.
        if self.knn_clf != None:
            print("模型已经存在,不需要构造")
            return
        x = self.pos().x() + 10
        y = self.pos().y() + 50
        h, w = self.height() - 20, self.width()
        screen = QApplication.primaryScreen()
        pix = screen.grabWindow(0, x, y, w, h)
        pix.save("draw.jpg")
        self.Data.getNumber()
    elif k == Qt.Key_F:
        # Grab the window contents and run recognition on them.
        x = self.pos().x() + 10
        y = self.pos().y() + 50
        h, w = self.height() - 20, self.width()
        screen = QApplication.primaryScreen()
        pix = screen.grabWindow(0, x, y, w, h)
        pix.save("draw.jpg")
        numbers = GetNumber.read_img(self.knn_clf.clf)
        self.show_number(numbers)
# coding=utf-8
# Make the Bus directory importable so its helper modules resolve.
import sys
sys.path.append('../Bus/')

# Classifier helpers living under Bus/.
import knn
import naiveBayes as nb
import svm

# k-nearest neighbours: overall result plus a 2-D draw of two features.
print(knn.KNN())
print(knn._2dDraw(['education', 'spouse_occupation'], 1300))

# Naive Bayes variants — print elements [0] and [1] of each helper's
# return value (each helper is intentionally invoked once per element,
# matching the original call pattern).
for part in (0, 1):
    print(nb._GaussianNB()[part])
for part in (0, 1):
    print(nb._MultinomialNB()[part])
for part in (0, 1):
    print(nb._BernoulliNB()[part])

# SVM variants.
for part in (0, 1):
    print(svm._SVM()[part])
for part in (0, 1):
    print(svm._LinearSVC()[part])
import numpy as np
import knn
import navie_bayes as nb
import decision_tree as dt
import random_forrest as rf
import boosting as bt

print('Loading data.txt...')
data = np.loadtxt('data.txt')
# Last column is the class label; the rest are features.
trainin = data[:, :-1]
trainout = data[:, -1]


def _evaluate(title, make_clf):
    """Announce, build, train and 10-fold-test one classifier; return it."""
    print('classify with ' + title)
    clf = make_clf()
    clf.train(trainin, trainout)
    clf.test(cross_fold=10)
    print('\n')
    return clf


knn1 = _evaluate('KNN', lambda: knn.KNN(k=3))
nb1 = _evaluate('Navie Bayes', nb.NB)
dt1 = _evaluate('Decision Tree', lambda: dt.DT(N=5))
rf1 = _evaluate('Random Forrest', lambda: rf.RF(N=5, NTree=5))
print('classify with Boosting')
def readCommand(argv):
    """Processes the command used to run from the command line.

    Parses classifier/dataset options from argv and returns (args, options),
    where args maps 'classifier', 'featureFunction' and 'printImage' to the
    configured objects. Exits with status 2 on invalid input.

    Fix: the --autotune confirmation line was a Python 2 print statement
    (a syntax error under Python 3, which the rest of this function uses);
    converted to print().
    """
    from optparse import OptionParser
    parser = OptionParser(USAGE_STRING)

    parser.add_option('-r', '--run', help=default('automatically runs training and test cycle for 5 times'), default=False, action='store_true')
    parser.add_option('-c', '--classifier', help=default('The type of classifier'), choices=['mostFrequent', 'naiveBayes', 'perceptron', 'knn'], default='mostFrequent')
    parser.add_option('-d', '--data', help=default('Dataset to use'), choices=['digits', 'faces'], default='digits')
    parser.add_option('-t', '--training', help=default('The ratio of the training set to use'), default=1.0, type="float")
    parser.add_option('-f', '--features', help=default('Whether to use enhanced features'), default=False, action="store_true")
    parser.add_option('-o', '--odds', help=default('Whether to compute odds ratios'), default=False, action="store_true")
    parser.add_option('-1', '--label1', help=default("First label in an odds ratio comparison"), default=0, type="int")
    parser.add_option('-2', '--label2', help=default("Second label in an odds ratio comparison"), default=1, type="int")
    parser.add_option('-k', '--smoothing', help=default("Smoothing parameter (ignored when using --autotune)"), type="float", default=2.0)
    parser.add_option('-a', '--autotune', help=default("Whether to automatically tune hyperparameters"), default=False, action="store_true")
    parser.add_option('-i', '--iterations', help=default("Maximum iterations to run training"), default=3, type="int")

    options, otherjunk = parser.parse_args(argv)
    if len(otherjunk) != 0:
        raise Exception('Command line input not understood: ' + str(otherjunk))
    args = {}

    # Set up variables according to the command line input.
    print("Doing classification")
    print("--------------------")
    print("data:\t\t" + options.data)
    print("classifier:\t\t" + options.classifier)
    print("using enhanced features?:\t" + str(options.features))

    # Select the image printer and feature extractor for the dataset.
    if options.data == "digits":
        printImage = ImagePrinter(DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT).printImage
        if options.features:
            featureFunction = enhancedFeatureExtractorDigit
        else:
            featureFunction = basicFeatureExtractorDigit
    elif options.data == "faces":
        printImage = ImagePrinter(FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT).printImage
        if options.features:
            featureFunction = enhancedFeatureExtractorFace
        else:
            featureFunction = basicFeatureExtractorFace
    else:
        print("Unknown dataset", options.data)
        print(USAGE_STRING)
        sys.exit(2)

    if options.data == "digits":
        legalLabels = range(10)
    else:
        legalLabels = range(2)

    # Validate numeric options.
    if options.training <= 0:
        print("Training set size should be a positive integer (you provided: %d)" % options.training)
        print(USAGE_STRING)
        sys.exit(2)

    if options.smoothing <= 0:
        print("Please provide a positive number for smoothing (you provided: %f)" % options.smoothing)
        print(USAGE_STRING)
        sys.exit(2)

    if options.odds:
        if options.label1 not in legalLabels or options.label2 not in legalLabels:
            print("Didn't provide a legal labels for the odds ratio: (%d,%d)" % (options.label1, options.label2))
            print(USAGE_STRING)
            sys.exit(2)

    # Instantiate the requested classifier.
    if options.classifier == "mostFrequent":
        classifier = mostFrequent.MostFrequentClassifier(legalLabels)
    elif options.classifier == "naiveBayes":
        classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
        classifier.setSmoothing(options.smoothing)
        if options.autotune:
            print("using automatic tuning for naivebayes")
            classifier.automaticTuning = True
        else:
            print("using smoothing parameter k=%f for naivebayes" % options.smoothing)
    elif options.classifier == "perceptron":
        classifier = perceptron.PerceptronClassifier(legalLabels, options.iterations)
    elif options.classifier == "knn":
        classifier = knn.KNN(legalLabels)
    else:
        print("Unknown classifier:", options.classifier)
        print(USAGE_STRING)
        sys.exit(2)

    args['classifier'] = classifier
    args['featureFunction'] = featureFunction
    args['printImage'] = printImage

    return args, options
digits_train, digits_test = utils.get_deskew_imgs( digits_train), utils.get_deskew_imgs(digits_test) holes_train, holes_test = utils.get_hole_features( digits_train), utils.get_hole_features(digits_test) pix_train, pix_test = utils.get_pix_features( digits_train), utils.get_pix_features(digits_test) X_train, X_test = np.hstack([pix_train, holes_train]), np.hstack([pix_test, holes_test]) mean_normalizer = utils.normalization(X_train) X_train = mean_normalizer.transform(X_train) X_test = mean_normalizer.transform(X_test) mx_score = 0 best = (-1, -1) clf = knn.KNN(mode='weighted') for n_component in range(3, 61, 3): for k in range(1, 11): _pca = pca.PCA(X_train) X_train_reduced = _pca.transform(X_train, n_component) X_test_reduced = _pca.transform(X_test, n_component) start_time = timeit.default_timer() validation_scores = [] kf = KFold(n_splits=10) for t_idx, v_idx in kf.split(X_train_reduced): X_train_T, X_train_V = X_train_reduced[t_idx], X_train_reduced[ v_idx] y_train_T, y_train_V = y_train[t_idx], y_train[v_idx] clf.fit(X_train_T, y_train_T) validation_score = clf.score(X_train_V, y_train_V, k)
# This script will show as an example of the use of a KNN and SVM learners
# Created by Elijah Flinders

# --- SVM: build and train on its own dataset ---
print("*********************************************************************")
print("Creating and testing SVM on it's own dataset. Support Vector Machine")
print("*********************************************************************")
svm = svm.SVM(10000, 0.000001)
svm.fit()
print("Finished running the SVM!\n")

# --- KNN: load iris.csv, coerce column types, prepare predictions ---
print("************************************************")
print("Creating and testing KNN. K-th Nearest Neighbor")
print("************************************************")
knnTester = knn.KNN()

# Load the Iris data set; feature columns become floats, the label column
# (the last one) becomes an int code.
dataset = knnTester.loadCsvListKnn('iris.csv')
last_col = len(dataset[0]) - 1
for col in range(last_col):
    knnTester.colToFloat(dataset, col)
knnTester.colToInt(dataset, last_col)

# Model size and one reference record per iris species.
neighbors = 5
testSetosa = [4.5, 2.3, 1.3, 0.3]
testVersicolor = [7.0, 3.2, 4.7, 1.4]
testVirginica = [6.3, 3.3, 6.0, 2.5]
# try to predict labels for each type
import knn
import util

# Build the classifier, load the iris data, and print the 5-NN run result.
test = knn.KNN()
data = util.openFile("data/iris.csv")
result = test.knnRun(data, 5)
print(result)