def get_test_label_list(train_file_route, test_file_route, algorithm, classifier):
    """Vectorize the train/test files with the chosen term-weighting scheme,
    then classify the test set.

    Args:
        train_file_route: path to the training corpus file.
        test_file_route: path to the test corpus file.
        algorithm: term-weighting scheme, one of 'tf_idf', 'tf_dc';
            anything else falls back to tf_bdc.
        classifier: 'KNN' for k-nearest-neighbors; anything else uses SVM.

    Returns:
        (test_label_list, predict_test_label_list): the true labels of the
        test set and the labels predicted by the chosen classifier.
    """
    # BUG FIX: the original used two independent `if` statements here, so for
    # algorithm == 'tf_idf' the `else` of the *second* `if` also executed and
    # silently overwrote the tf_idf features with tf_bdc ones. `elif` makes
    # the three schemes mutually exclusive.
    if algorithm == 'tf_idf':
        train_feature_sparse_matrix, train_label_list = algo.tf_idf(
            train_file_route)
        test_feature_sparse_matrix, test_label_list = algo.tf_idf(
            test_file_route)
    elif algorithm == 'tf_dc':
        train_feature_sparse_matrix, train_label_list = algo.tf_dc(
            train_file_route)
        test_feature_sparse_matrix, test_label_list = algo.tf_dc(
            test_file_route)
    else:
        # default scheme: tf_bdc
        train_feature_sparse_matrix, train_label_list = algo.tf_bdc(
            train_file_route)
        test_feature_sparse_matrix, test_label_list = algo.tf_bdc(
            test_file_route)

    if classifier == 'KNN':
        predict_test_label_list = clf.KNN(train_feature_sparse_matrix,
                                          train_label_list,
                                          test_feature_sparse_matrix)
    else:
        predict_test_label_list = clf.SVM(train_feature_sparse_matrix,
                                          train_label_list,
                                          test_feature_sparse_matrix)
    return test_label_list, predict_test_label_list
def main(argv): '''Builds two KNN classifiers and prints them out. The first uses all of the exemplars, the second uses only 10. ''' # usage if len(argv) < 2: print 'Usage: python %s <data file> <optional category file>' % ( argv[0]) exit(-1) # read the data d = data.Data(argv[1]) # get the categories and data matrix if len(argv) > 2: catdata = data.Data(argv[2]) cats = catdata.get_data([catdata.get_headers()[0]]) A = d.get_data(d.get_headers()) else: # assume the categories are the last column cats = d.get_data([d.get_headers()[-1]]) A = d.get_data(d.get_headers()[:-1]) # create a new classifier knnc = classifier.KNN() # build the classifier using all exemplars knnc.build(A, cats) # print the classifier # requires a __str__ method print knnc # build and print the classifier using 10 exemplars per class knnc2 = classifier.KNN() knnc2.build(A, cats, 10) print knnc2 return
def getRating(tweet):
    """Rate a tweet with a k=8 KNN classifier trained on the stored
    feature vectors: pre-process the tweet, turn it into a feature
    vector, fit the classifier, and return the predicted rating."""
    processed = preProcess(tweet)
    feature_vec = getFeatureVector(processed)

    # stored training data: an array of feature arrays plus their labels
    train_features, train_labels = loadTrainSetFV()

    model = classifier.KNN(k=8)
    model.train(train_features, train_labels)
    return model.predict(feature_vec)
def main(argv): '''Reads in a training set and a test set and builds two KNN classifiers. One uses all of the data, one uses 10 exemplars. Then it classifies the test data and prints out the results. ''' # usage if len(argv) < 3: print 'Usage: python %s <training data file> <test data file> <optional training category file> <optional test category file>' % (argv[0]) exit(-1) # read in the training set data_train = data.Data(argv[1]) # read in the test set data_test = data.Data(argv[2]) # compatibility check length or argv if len(argv) > 4: # get the categories of the training data train_cat_data = data.Data(argv[3]) train_cats = train_cat_data.get_data( [train_cat_data.get_headers()[0]] ) # get the categories of the test data test_cat_data = data.Data(argv[4]) test_cats = test_cat_data.get_data( [test_cat_data.get_headers()[0]] ) # get the training data A and the test data B A = data_train.get_data( data_train.get_headers() ) B = data_test.get_data( data_test.get_headers() ) else: # just assume the categories are the last column train_cats = data_train.get_data( [data_train.get_headers()[-1]] ) test_cats = data_test.get_data( [data_test.get_headers()[-1]] ) A = data_train.get_data( data_train.get_headers()[:-1] ) B = data_test.get_data( data_test.get_headers()[:-1] ) #----------------------------------------------------------------------- # create two classifiers knnClass = classifier.KNN() print "Created Classifier, Building Now."
def main(argv): '''Reads in a training set and a test set and builds two KNN classifiers. One uses all of the data, one uses 10 exemplars. Then it classifies the test data and prints out the results. ''' # usage if len(argv) < 3: print 'Usage: python %s <training data file> <test data file> <optional training category file> <optional test category file>' % ( argv[0]) exit(-1) # read in the training set data_train = data.Data(argv[1]) # read in the test set data_test = data.Data(argv[2]) # compatibility check length or argv if len(argv) > 4: # get the categories of the training data train_cat_data = data.Data(argv[3]) train_cats = train_cat_data.get_data([train_cat_data.get_headers()[0]]) # get the categories of the test data test_cat_data = data.Data(argv[4]) test_cats = test_cat_data.get_data([test_cat_data.get_headers()[0]]) # get the training data A and the test data B A = data_train.get_data(data_train.get_headers()) B = data_test.get_data(data_test.get_headers()) else: # just assume the categories are the last column train_cats = data_train.get_data([data_train.get_headers()[-1]]) test_cats = data_test.get_data([data_test.get_headers()[-1]]) A = data_train.get_data(data_train.get_headers()[:-1]) B = data_test.get_data(data_test.get_headers()[:-1]) #----------------------------------------------------------------------- # create two classifiers knnClass = classifier.KNN() print "Created Classifier, Building Now." # build the classifiers knnClass.build(A, train_cats) print "Built! Now classifying." #----------------------------------------------------------------------- #-Classifies the training set data and prints out a confusion matrix. acats, alabels = knnClass.classify(A) print "Done Classifying." 
unique, mapping = np.unique(np.array(train_cats.T), return_inverse=True) unique2, mapping2 = np.unique(np.array(alabels.T), return_inverse=True) mtx = knnClass.confusion_matrix( np.matrix(mapping).T, np.matrix(mapping2).T) print "Training Confusion Matrix:" print knnClass.confusion_matrix_str(mtx) #----------------------------------------------------------------------- #----------------------------------------------------------------------- #-Classifies the test set data and prints out a confusion matrix. bcats, blabels = knnClass.classify(B) print "Done Classifying." unique, mapping = np.unique(np.array(test_cats.T), return_inverse=True) unique2, mapping2 = np.unique(np.array(blabels.T), return_inverse=True) mtx1 = knnClass.confusion_matrix( np.matrix(mapping).T, np.matrix(mapping2).T) print "Test Confusion Matrix:" print knnClass.confusion_matrix_str(mtx1) #----------------------------------------------------------------------- #Writes out a new CSV data file with the test set data # and the categories as an extra column data_test.addColumn("KNN Classification", bcats) data_test.toFile(filename="knnClass.csv") return
Xtrain, Ytrain) test_err = classifier.decision_tree(Xtrain, Ytrain, Xtest, Ytest, depth=max_depth) print "Decision Tree Classifier\n" print "Validation Error = ", val_err print "Training Error = ", train_err print "Testing Error = ", test_err print "Optimal Depth = ", max_depth print "\n" ''' K Nearest Neighbor ''' val_err, train_err, opt_K = classifier.K_Fold_crossValidation_KNN( Xtrain, Ytrain) test_err = classifier.KNN(Xtrain, Ytrain, Xtest, Ytest, K=opt_K) print "K Nearest Neighbor Classifier\n" print "Validation Error = ", val_err print "Training Error = ", train_err print "Testing Error = ", test_err print "Optimal K = ", opt_K print "\n" ''' SVM - linear kernel ''' val_err, train_err, opt_C = classifier.K_Fold_crossValidation_SVM(Xtrain, Ytrain, ker='linear') test_err = classifier.SVM(Xtrain, Ytrain, Xtest, Ytest, ker='linear',
__author__ = "Harshilkumar Patel"
__status__ = "Development"

import config
import classifier
from utils import logger
import constants

# Load the training data once; the chosen classifier is constructed from it.
data = config.get_training_data()

# Select the classifier implementation from the project-wide configuration.
if constants.CLASSIFIER_CHOICE == "knn":
    Classifier = classifier.KNN(data)
else:
    Classifier = classifier.NaiveBayes(data)

logger.debug("formatted data is %s", Classifier.data)

# Classify the sample read from input.txt.
# NOTE(review): assumes get_training_data(...) returns a tuple whose
# element [1] is the feature portion fed to predict() — confirm in config.
result = Classifier.predict(config.get_training_data("input.txt")[1])
logger.debug("THE FINAL PREDICTION is %s", result)

# f = open('output.txt', 'w')
# f.write(result)
# f.close()
def main(argv): '''Reads in a training set and a test set and builds two KNN classifiers. One uses all of the data, one uses 10 exemplars. Then it classifies the test data and prints out the results. ''' # usage if len(argv) < 3: print 'Usage: python %s <training data file> <test data file> <optional training category file> <optional test category file>' % ( argv[0]) exit(-1) # read the training and test sets dtrain = data.Data(argv[1]) dtest = data.Data(argv[2]) # get the categories and the training data A and the test data B if len(argv) > 4: traincatdata = data.Data(argv[3]) testcatdata = data.Data(argv[4]) traincats = traincatdata.get_data([traincatdata.get_headers()[0]]) testcats = testcatdata.get_data([testcatdata.get_headers()[0]]) A = dtrain.get_data(dtrain.get_headers()) B = dtest.get_data(dtest.get_headers()) else: # assume the categories are the last column traincats = dtrain.get_data([dtrain.get_headers()[-1]]) testcats = dtest.get_data([dtest.get_headers()[-1]]) A = dtrain.get_data(dtrain.get_headers()[:-1]) B = dtest.get_data(dtest.get_headers()[:-1]) # create two classifiers, one using 10 exemplars per class knncall = classifier.KNN() knnc10 = classifier.KNN() # build the classifiers knncall.build(A, traincats) knnc10.build(A, traincats, 10) # use the classifiers on the test data allcats, alllabels = knncall.classify(B) tencats, tenlabels = knnc10.classify(B) # print the results print 'Results using All Exemplars:' print ' True Est' for i in range(allcats.shape[0]): if int(testcats[i, 0]) == int(allcats[i, 0]): print "%03d: %4d %4d" % (i, int(testcats[i, 0]), int(allcats[i, 0])) else: print "%03d: %4d %4d **" % (i, int( testcats[i, 0]), int(allcats[i, 0])) print knnc10 print 'Results using 10 Exemplars:' print ' True Est' for i in range(tencats.shape[0]): if int(testcats[i, 0]) == int(tencats[i, 0]): print "%03d: %4d %4d" % (i, int(testcats[i, 0]), int(tencats[i, 0])) else: print "%03d: %4d %4d **" % (i, int( testcats[i, 0]), int(tencats[i, 0])) 
return