def process(traind, trainc, testd, testc):
    """Train a KNN classifier from one data/category file pair and print
    confusion matrices for both the training set and the test set.

    traind/trainc -- training data / training category CSV filenames
    testd/testc   -- test data / test category CSV filenames
    """
    filename = "new_result.csv"  # kept from the original; not used below
    num = 6    # number of exemplars per class handed to the classifier
    neigh = 3  # number of neighbors consulted at classification time
    print("reading data")
    train_data = data.Data(traind)
    train_cat_data = data.Data(trainc)
    test_data = data.Data(testd)
    test_cat_data = data.Data(testc)
    train_matrix = train_data.get_data(train_data.get_headers()).T
    traincats = train_cat_data.get_data([train_cat_data.get_headers()[0]]).T
    print("building knn")
    knn = classifiers.KNN(dataObj=train_data,
                          headers=train_data.get_headers(),
                          categories=traincats,
                          K=num)
    # evaluate on the training set
    cats, labels = knn.classify(train_matrix, K=neigh)
    conf = knn.confusion_matrix(traincats, cats)
    print(knn.confusion_matrix_str(conf))
    # evaluate on the test set
    test_matrix = test_data.get_data(test_data.get_headers()).T
    testcats = test_cat_data.get_data([test_cat_data.get_headers()[0]]).T
    cats, labels = knn.classify(test_matrix, K=neigh)
    conf = knn.confusion_matrix(testcats, cats)
    print(knn.confusion_matrix_str(conf))
def KNN_classify_complete():
    """Train a KNN classifier (5 exemplars per class) on every feature
    column of the word-prime data and print confusion matrices for both
    the training and the test set."""
    train = data.Data("wordPrimeTrain.csv")
    test = data.Data("wordPrimeTest.csv")
    # every column except the last is a feature; "category" holds labels
    train_matrix = train.get_columns(train.get_headers()[:-1])
    test_matrix = test.get_columns(test.get_headers()[:-1])
    train_labels = train.get_columns(["category"])
    test_labels = test.get_columns(["category"])
    knn = classifiers.KNN()
    knn.build(train_matrix, train_labels, 5)
    print("complete KNN, confusion matrix\n")
    print("on train data\n")
    cats, labels = knn.classify(train_matrix)
    print(knn.confusion_matrix_str(knn.confusion_matrix(train_labels, labels)))
    print("on test data\n")
    cats, labels = knn.classify(test_matrix)
    print(knn.confusion_matrix_str(knn.confusion_matrix(test_labels, labels)))
def KNN_classify_partial():
    """Train a KNN classifier (5 exemplars per class) on a hand-picked
    subset of the word-prime features and print confusion matrices for
    both the training and the test set."""
    feature_cols = ["word_dist", "pron_dist", "Target_Freq_N", "cue_Freq_N"]
    train = data.Data("wordPrimeTrain.csv")
    test = data.Data("wordPrimeTest.csv")
    train_matrix = train.get_columns(feature_cols)
    test_matrix = test.get_columns(feature_cols)
    train_labels = train.get_columns(["category"])
    test_labels = test.get_columns(["category"])
    knn = classifiers.KNN()
    knn.build(train_matrix, train_labels, 5)
    print("partial KNN, confusion matrix\n")
    print("on train data\n")
    cats, labels = knn.classify(train_matrix)
    print(knn.confusion_matrix_str(knn.confusion_matrix(train_labels, labels)))
    print("on test data\n")
    cats, labels = knn.classify(test_matrix)
    print(knn.confusion_matrix_str(knn.confusion_matrix(test_labels, labels)))
def process(traind, trainc, testd, testc, write=True, K=10):
    """Build a KNN classifier from the training data/category files, print
    confusion matrices for the training and test sets, append the predicted
    categories to the test data, and optionally write the annotated test
    data to "results_knn.csv".

    Parameters:
      traind/trainc -- training data / training category CSV filenames
      testd/testc   -- test data / test category CSV filenames
      write         -- when True, write the annotated test data to disk
      K             -- exemplar count handed to the KNN classifier
    Returns the test Data object (with the new "Category" column).
    """
    filename = "results_knn.csv"
    print("Reading data")
    dtrain = data.Data(traind)
    dtest = data.Data(testd)
    train_headers = dtrain.get_headers()
    # BUG FIX: test_headers was previously taken from dtrain; use the test
    # file's own headers so the column lookup works when the files differ.
    test_headers = dtest.get_headers()

    traincats = data.Data(trainc)
    traincatdata = traincats.all_rows_specified_columns(traincats.get_headers())
    testcats = data.Data(testc)
    testcatdata = testcats.all_rows_specified_columns(testcats.get_headers())

    # map arbitrary category labels onto consecutive integer codes so the
    # confusion-matrix routines can index by category
    uniquelabels, correctedtraincats = numpy.unique(
        traincatdata.T.tolist()[0], return_inverse=True)
    correctedtraincats = numpy.matrix([correctedtraincats]).T
    uniquelabels, correctedtestcats = numpy.unique(
        testcatdata.T.tolist()[0], return_inverse=True)
    correctedtestcats = numpy.matrix([correctedtestcats]).T

    print('Building KNN Classifier')
    knnc = classifiers.KNN(dtrain, train_headers, traincatdata, K)

    print('KNN Training Set Results')
    A = dtrain.all_rows_specified_columns(train_headers)
    newcats, newlabels = knnc.classify(A)
    confmtx = knnc.confusion_matrix(correctedtraincats, newcats)
    print(knnc.confusion_matrix_str(confmtx))

    print('KNN Test Set Results')
    A = dtest.all_rows_specified_columns(test_headers)
    newcats, newlabels = knnc.classify(A)
    # print the confusion matrix
    confmtx = knnc.confusion_matrix(correctedtestcats, newcats)
    print(knnc.confusion_matrix_str(confmtx))

    # append the predicted categories to the test data
    dtest.addColumn("Category", "numeric", newcats.T.A[0])
    # if you want to write the test results in a csv file
    if write:
        dtest.write(filename, headers=dtest.get_headers())
    return dtest
def training(argv):
    """Read a training and a test set (categories possibly in separate
    files), build the classifier named by argv[-1] ("NaiveBayes" or KNN
    by default), and return the confusion matrices, the true category
    lists, and both Data objects.

    argv layout: [train_file, test_file, (train_cats, test_cats,) method]
    """
    trainData = data.Data(argv[0])
    testData = data.Data(argv[1])  # test data
    headerList = [trainData.getHeaderRaw(), testData.getHeaderRaw()]
    headers = []  # header names for cmtx (currently unused)
    if len(argv) > 4:
        # categories come in their own files; use every data column
        traincatdata = data.Data(argv[2])
        testcatdata = data.Data(argv[3])
        traincats = traincatdata.getDataNum([traincatdata.getHeaderRaw()[0]])
        testcats = testcatdata.getDataNum([testcatdata.getHeaderRaw()[0]])
        train_matrix = trainData.getDataNum(trainData.getHeaderRaw())
        test_matrix = testData.getDataNum(testData.getHeaderRaw())
    else:
        # otherwise the last column of each file holds the categories
        traincats = trainData.getDataNum([trainData.getHeaderRaw()[-1]])
        testcats = testData.getDataNum([testData.getHeaderRaw()[-1]])
        train_matrix = trainData.getDataNum(trainData.getHeaderRaw()[:-1])
        test_matrix = testData.getDataNum(testData.getHeaderRaw()[:-1])
    if argv[-1] == "NaiveBayes":
        classifier = classifiers.NaiveBayes()
    else:
        classifier = classifiers.KNN()
    print("this may take a little while...")
    classifier.build(train_matrix, traincats)
    ctraincats, ctrainlabels = classifier.classify(train_matrix)
    ctestcats, ctestlabels = classifier.classify(test_matrix)
    trainDataStr = classifier.confusionMatrix(traincats, ctrainlabels)
    testDataStr = classifier.confusionMatrix(testcats, ctestlabels)
    print("done training")
    return (trainDataStr, testDataStr, traincats.T.tolist()[0],
            testcats.T.tolist()[0], trainData, testData)
def main(argv):
    '''Builds two KNN classifiers and prints them out. The first uses all
    of the exemplars, the second uses only 10 per class.
    '''
    dataset = data.Data("iris_proj8_all.csv")
    # get the categories and the data matrix
    if len(argv) > 2:
        # categories live in a separate file; use every data column
        catdata = data.Data(argv[2])
        cats = catdata.get_data([catdata.get_headers()[0]])
        matrix = dataset.get_data(dataset.get_headers())
    else:
        # otherwise the last column holds the categories
        cats = dataset.get_data([dataset.get_headers()[-1]])
        matrix = dataset.get_data(dataset.get_headers()[:-1])
    # classifier built from every exemplar
    knn_all = classifiers.KNN()
    knn_all.build(matrix, cats)
    print(knn_all)  # relies on KNN.__str__
    # classifier condensed to 10 exemplars per class
    knn_ten = classifiers.KNN()
    knn_ten.build(matrix, cats, 10)
    print(knn_ten)
    return
def main(argv):
    """Read training/test sets, let the user choose Naive Bayes or KNN,
    print confusion matrices for both sets, and write the categorized
    test data to 'categorizedData.csv'.

    argv: [script, train_file, test_file, (train_cats, test_cats)]
    """
    if len(argv) < 3:
        print('Usage: python %s <training data file> <test data file> <optional training category file> <optional test category file>' % (argv[0]))
        exit(-1)
    # read the training and test sets
    dtrain = data.Data(argv[1])
    dtest = data.Data(argv[2])
    # get the categories and the training data A and the test data B
    if len(argv) > 4:
        traincatdata = data.Data(argv[3])
        testcatdata = data.Data(argv[4])
        traincats = traincatdata.get_data([traincatdata.get_headers()[0]])
        testcats = testcatdata.get_data([testcatdata.get_headers()[0]])
        A = dtrain.get_data(dtrain.get_headers())
        B = dtest.get_data(dtest.get_headers())
    else:
        # assume the categories are the last column
        traincats = dtrain.get_data([dtrain.get_headers()[-1]])
        testcats = dtest.get_data([dtest.get_headers()[-1]])
        A = dtrain.get_data(dtrain.get_headers()[:-1])
        B = dtest.get_data(dtest.get_headers()[:-1])
    userChoice = raw_input(
        "Which classifier would you like to use?\n[n] for Naive Bayes and [k] for KNN: "
    )
    if userChoice.lower() == 'n':
        classifier = classifiers.NaiveBayes()
    elif userChoice.lower() == 'k':
        classifier = classifiers.KNN()
    else:
        print("type in valid classifier type")
        return
    # build classifier with training set categories
    classifier.build(A, traincats)
    # classify training set
    catsTrain, labelsTrain = classifier.classify(A)
    # BUG FIX: the confusion-matrix strings were built but never printed,
    # despite the surrounding comments saying "print out"
    print(classifier.confusion_matrix_str(
        classifier.confusion_matrix(traincats, catsTrain)))
    # classify test set and print its confusion matrix
    catsTest, labelsTest = classifier.classify(B)
    print(classifier.confusion_matrix_str(
        classifier.confusion_matrix(testcats, catsTest)))
    # add category column and write to csv
    dtest.addColumn(['category', 'numeric'] + catsTest.T.tolist()[0])
    dtest.writeToCSV('categorizedData.csv', dtest.get_raw_headers())
def build_classifier(training_data, training_labels, method):
    """Build and return a classifier trained on the given data.

    method selects the algorithm: "Naive Bayes"/"naivebayes" or
    "K-Nearest Neighbors"/"knn".  Any other value prints an error and
    exits the process.
    """
    if method in ("Naive Bayes", "naivebayes"):
        nbc = classifiers.NaiveBayes()
        nbc.build(training_data, training_labels)
        return nbc
    elif method in ("K-Nearest Neighbors", "knn"):
        knn = classifiers.KNN()
        knn.build(training_data, training_labels)
        return knn
    else:
        # TYPO FIX: "Uknown" -> "Unknown"
        print("Unknown method: Use 'knn' or 'naivebayes'")
        exit(-1)
def export_classifiers():
    """Load the pickled training and test feature sets, fit the full
    battery of classifiers on the words+bigrams+POS feature set, and
    return the fitted models keyed by short name."""
    trained = util.load_pickle(name='fs_1', path='..\\pickles\\feature_sets\\')
    print('trained', size(trained))
    test = util.load_pickle(name='fs_test_1', path='..\\pickles\\test_features\\')
    print('test', size(test))
    test_data = test['data_set']
    featureset = 'fs_words_bigrams_pos'
    X_train, y_train = trained[featureset], trained['labels']
    X_test, y_test = test[featureset], test['labels']
    feat_size = X_train.shape[1]
    # KNN needs dense arrays; the other models take the sparse matrices
    knn = c.KNN(X_test=X_test.toarray(), y_test=y_test)
    nb = c.NB(X_test=X_test, y_test=y_test)
    dt = c.DT(X_test=X_test, y_test=y_test)
    rf = c.RF(X_test=X_test, y_test=y_test)
    xgb = c.XGB(X_test=X_test, y_test=y_test)
    svm = c.SVM(X_test=X_test, y_test=y_test)
    nn = c.NN(X_test=X_test, y_test=y_test)
    mc = c.MC(X_test=test_data, y_test=y_test)
    # fit each model with its tuned hyper-parameters (order unchanged)
    knn.fit(X_train.toarray(), y_train,
            params={'leaf_size': 100, 'n_jobs': -1, 'n_neighbors': 55, 'p': 3})
    nb.fit(X_train, y_train, params={'alpha': 1.5})
    dt.fit(X_train, y_train, params={'max_depth': 8, 'min_samples_leaf': 3})
    rf.fit(X_train, y_train,
           params={'min_samples_leaf': 20, 'n_estimators': 500, 'n_jobs': -1})
    xgb.fit(X_train, y_train,
            params={'learning_rate': 0.125, 'max_depth': 10, 'n_estimators': 400})
    svm.fit(X_train, y_train,
            params={'C': 2, 'kernel': 'linear', 'probability': True})
    # small feed-forward net: two hidden layers with dropout
    nn_layers = [Dropout(0.5, input_shape=(feat_size,)),
                 Dense(50, kernel_initializer='normal', activation='relu',
                       kernel_constraint=maxnorm(3)),
                 Dropout(0.5),
                 Dense(50, kernel_initializer='normal', activation='sigmoid'),
                 Dropout(0.25),
                 Dense(1, kernel_initializer='normal', activation='sigmoid')]
    nn.fit(X_train, y_train, params={'epochs': 10, 'layers': nn_layers})
    mc.fit(X_train=X_train, y_train=y_train)
    return {'knn': knn, 'nb': nb, 'dt': dt, 'rf': rf,
            'xgb': xgb, 'svm': svm, 'nn': nn, 'mc': mc}
def main(argv):
    """Load a trained Keras model, strip its final layers to obtain an
    embedding network, embed the Greek-letter images (a training set and
    a handwritten test set), and evaluate a KNN classifier in embedding
    space.

    argv: [script, model.h5, greek_data.csv, greek_training_labels.csv,
           greek_test_data.csv, greek_test_labels.csv]
    """
    # usage
    if len(argv) < 6:
        print(
            "Usage: python3 %s <model.h5> <greek_data.csv> <greek_training_labels.csv> <greek_test_data.csv> <greek_test_labels.csv>"
            % argv[0])
        exit()
    # load the model as an embedding space
    print("Loading model from %s" % argv[1])
    model = load_model(argv[1])
    # the third-from-last layer's output is used as the embedding vector
    embedding_model = Model(inputs=model.input, outputs=model.layers[-3].output)
    # embedding_model.summary()
    # read training data and training labels
    greek_training_data_input = read_data(argv[2])
    greek_training_data_input = greek_training_data_input.astype('float32')
    greek_training_data_input /= 255  # scale pixel values into [0, 1]
    # add a trailing channel axis for the conv net input
    greek_training_data_input = np.expand_dims(greek_training_data_input, axis=3)
    print("training input shape: ", greek_training_data_input.shape)
    greek_training_data_output = embedding_model.predict(
        greek_training_data_input)
    print("training output shape: ", greek_training_data_output.shape)
    # read in labels
    greek_training_labels = read_labels(argv[3])
    # reading testing data and testing labels (Mike's handwritten greek letters)
    greek_testing_data_input = read_data(argv[4])
    greek_testing_data_input = greek_testing_data_input.astype('float32')
    greek_testing_data_input /= 255
    greek_testing_data_input = np.expand_dims(greek_testing_data_input, axis=3)
    print("testing input shape: ", greek_testing_data_input.shape)
    greek_testing_data_output = embedding_model.predict(
        greek_testing_data_input)
    print("testing output shape: ", greek_testing_data_output.shape)
    # read in labels
    greek_testing_labels = read_labels(argv[5])
    # integer label -> letter name (used by the commented-out experiments)
    idx2letter = {0: "alpha", 1: "beta", 2: "gamma"}
    # (removed: commented-out SSD sanity-check experiments that ranked the
    # training embeddings by distance to alpha/beta/gamma exemplars)
    # test KNN classifier
    print("Testing KNN classifier")
    # column vectors of integer categories, as the KNN class expects
    training_cats = np.matrix(greek_training_labels).T
    testing_cats = np.matrix(greek_testing_labels).T
    K = 3
    print('Building KNN Classifier (K=%d)' % K)
    knnc = classifiers.KNN(greek_training_data_output, training_cats, K)
    print('KNN Training Set Results')
    newcats, newlabels = knnc.classify(greek_training_data_output)
    confmtx = knnc.confusion_matrix(
        np.matrix(greek_training_labels).T, newcats)
    print(knnc.confusion_matrix_str(confmtx))
    print('KNN Test Set Results')
    # presumably the True flag asks classify() for distances as a third
    # return value -- confirm against the KNN implementation
    newcats, newlabels, d = knnc.classify(greek_testing_data_output, True)
    # print the confusion matrix
    confmtx = knnc.confusion_matrix(testing_cats, newcats)
    print(knnc.confusion_matrix_str(confmtx))
def buildClassifier(trainFile, testFile, tCats=None, ttCats=None, classType="NaiveBayes", save=False, K=None):
    '''Build a classifier (Naive Bayes by default, or KNN) from a training
    file, classify both the training and test sets, print both confusion
    matrices, and return the classifier.  Code inspired by Bruce's code.

    tCats/ttCats -- optional separate category files; when omitted the
                    last column of each data file holds the categories
    classType    -- "KNN" for k-nearest-neighbors, anything else is NB
    save         -- when True, dump the test classifications and labels
                    to cTestCats.csv / cTestLabels.csv
    K            -- neighbor count for KNN (defaults to 3)
    '''
    dtrain = data.Data(trainFile)
    dtest = data.Data(testFile)
    if tCats is not None and ttCats is not None:
        traincatdata = data.Data(tCats)
        traincats = traincatdata.get_data([traincatdata.get_headers()[0]],
                                          traincatdata.get_num_rows())
        testcatdata = data.Data(ttCats)
        testcats = testcatdata.get_data([testcatdata.get_headers()[0]],
                                        testcatdata.get_num_rows())
        A = dtrain.get_data(dtrain.get_headers(), dtrain.get_num_rows())
        B = dtest.get_data(dtest.get_headers(), dtest.get_num_rows())
    else:
        # assume the categories are the last column
        traincats = dtrain.get_data([dtrain.get_headers()[-1]],
                                    dtrain.get_num_rows())
        testcats = dtest.get_data([dtest.get_headers()[-1]],
                                  dtest.get_num_rows())
        A = dtrain.get_data(dtrain.get_headers()[:-1], dtrain.get_num_rows())
        B = dtest.get_data(dtest.get_headers()[:-1], dtest.get_num_rows())
    # choose the classifier; the original three branches all ran the same
    # build/classify sequence, so that duplication is collapsed here
    if classType == "KNN":
        nbc = classifiers.KNN(K=K if K is not None else 3)  # default K of 3
    else:
        nbc = classifiers.NaiveBayes()
    # build the classifier using the training data, then classify both sets
    nbc.build(A, traincats)
    ctraincats, ctrainlabels = nbc.classify(A)
    ctestcats, ctestlabels = nbc.classify(B)
    if save:
        ctestcats.tofile('cTestCats.csv', sep=" ", format="%s")
        ctestlabels.tofile('cTestLabels.csv', sep=" ", format="%s")
    print("Training Data")
    print(nbc.confusion_matrix_str(nbc.confusion_matrix(traincats, ctraincats)))
    print("Test Data")
    print(nbc.confusion_matrix_str(nbc.confusion_matrix(testcats, ctestcats)))
    return nbc
def main(argv):
    """Command-line driver: read a training and a test CSV (categories
    either in separate files or as the last column), run Naive Bayes
    ('n') or KNN ('k'), and print confusion matrices for both sets.
    The KNN path also writes the test data plus predictions to
    heresyourdata.csv.
    """
    time = datetime.datetime.now()  # captured but never used below
    # test function here
    if len(argv) < 4 or (argv[3] != 'k' and argv[3] != 'n'):
        print(
            'Usage: python %s <training data file> <test data file> <n for Naive Bayes, k for KNN> <optional training categories file> <optional test categories file>'
            % (argv[0]))
        print(
            ' If categories are not provided as separate files, then the last column is assumed to be the category.'
        )
        exit(-1)
    train_file = argv[1]
    test_file = argv[2]
    knn = True if argv[3] == 'k' else False
    dtrain = data.Data(train_file)
    dtest = data.Data(test_file)
    if len(argv) >= 6:
        # categories supplied as separate files; use every data column
        train_headers = dtrain.get_headers()
        # NOTE(review): test_headers is taken from dtrain here -- looks like
        # a copy/paste slip; confirm the two files share the same headers.
        test_headers = dtrain.get_headers()
        traincat_file = argv[4]
        testcat_file = argv[5]
        traincats = data.Data(traincat_file)
        traincatdata = traincats.limit_columns(traincats.get_headers())
        testcats = data.Data(testcat_file)
        testcatdata = testcats.limit_columns(testcats.get_headers())
    else:
        # last column of each file holds the category
        train_headers = dtrain.get_headers()[:-1]
        test_headers = dtrain.get_headers()[:-1]
        traincatdata = dtrain.limit_columns([dtrain.get_headers()[-1]])
        testcatdata = dtest.limit_columns([dtest.get_headers()[-1]])
    # map arbitrary category labels onto consecutive integer codes for the
    # confusion-matrix routines
    uniquelabels, correctedtraincats = np.unique(
        traincatdata.T.tolist()[0], return_inverse=True)
    correctedtraincats = np.matrix([correctedtraincats]).T
    uniquelabels, correctedtestcats = np.unique(testcatdata.T.tolist()[0],
                                                return_inverse=True)
    correctedtestcats = np.matrix([correctedtestcats]).T
    if not knn:
        nbc = classifiers.NaiveBayes(dtrain, train_headers, traincatdata)
        print('Naive Bayes Training Set Results')
        A = dtrain.limit_columns(train_headers)
        newcats, newlabels = nbc.classify(A)
        traincats = newcats
        print('making confusion matrix')
        confmtx = nbc.confusion_matrix(correctedtraincats, newcats)
        print(nbc.confusion_matrix_str(confmtx))
        print('Naive Bayes Test Set Results')
        # NOTE(review): converts header names to ints until one fails --
        # presumably for files whose columns are named by number; verify.
        for i in range(len(test_headers)):
            try:
                test_headers[i] = int(test_headers[i])
            except:
                break
        A = dtest.limit_columns(test_headers)
        print('classifying with naive bayes classifier')
        newcats, newlabels = nbc.classify(A)
        print('confusion matrix')
        confmtx = nbc.confusion_matrix(correctedtestcats, newcats)
        print(nbc.confusion_matrix_str(confmtx))
    else:
        print('knn')
        print('-----------------')
        print('Building KNN Classifier')
        knnc = classifiers.KNN(dtrain, train_headers, traincatdata, 3)
        print('KNN Training Set Results')
        A = dtrain.limit_columns(train_headers)
        newcats, newlabels = knnc.classify(A)
        traincats = newcats
        confmtx = knnc.confusion_matrix(correctedtraincats, newcats)
        print(knnc.confusion_matrix_str(confmtx))
        print('KNN Test Set Results')
        A = dtest.limit_columns(test_headers)
        newcats, newlabels = knnc.classify(A)
        print('KNN TEST::Correct labels\n', correctedtestcats.T)
        print('KNN TEST:::Predicted labels\n', newcats)
        # print the confusion matrix
        confmtx = knnc.confusion_matrix(correctedtestcats, newcats)
        print(knnc.confusion_matrix_str(confmtx))
        # append the predictions as a new column and write out the test data
        test_headers.append('predicted')
        dtest.add_header2col('predicted')
        dtest.add_column(newcats.T)
        dtest.write("heresyourdata.csv", test_headers)
    return
def main(argv):
    """Read a training set and a test set (categories possibly in separate
    files), build the classifier named by argv[-1] ("NaiveBayes", or KNN
    by default), print tabulated confusion matrices plus graphics for both
    sets, and write the annotated data under datasets/.

    argv: [script, train_file, test_file, (train_cats, test_cats,) method]
    """
    # usage
    if len(argv) < 3:
        print("usage: python %s <Training File> <Test File> <opt: Training Categories> <opt: Test Categories> <KNN or NaiveBayes>" % (argv[0]))
        return
    # read the training and test sets
    print("Reading: \n Training: %s\n Test: %s\n KNN/NB: %s\n " % (
        argv[1], argv[2], argv[-1]))
    trainData = data.Data(argv[1])
    testData = data.Data(argv[2])  # test data
    headerList = [trainData.getHeaderRaw(), testData.getHeaderRaw()]
    headers = []  # header names for cmtx (currently unused)
    # get the categories and the training data A and the test data B
    if len(argv) > 4:
        traincatdata = data.Data(argv[3])
        testcatdata = data.Data(argv[4])
        # needs to be a list
        traincats = traincatdata.getDataNum([traincatdata.getHeaderRaw()[0]])
        testcats = testcatdata.getDataNum([testcatdata.getHeaderRaw()[0]])
        A = trainData.getDataNum(trainData.getHeaderRaw())
        B = testData.getDataNum(testData.getHeaderRaw())
    else:
        # assume the categories are the last column
        traincats = trainData.getDataNum([trainData.getHeaderRaw()[-1]])
        testcats = testData.getDataNum([testData.getHeaderRaw()[-1]])
        A = trainData.getDataNum(trainData.getHeaderRaw()[:-1])
        B = testData.getDataNum(testData.getHeaderRaw()[:-1])
    if argv[-1] == "NaiveBayes":
        classifier = classifiers.NaiveBayes()
    else:
        classifier = classifiers.KNN()
    classifier.build(A, traincats)
    ctraincats, ctrainlabels = classifier.classify(A)
    print("Training Data")
    print(tabulate(
        classifier.confusionMatrixStr(
            classifier.confusionMatrix(traincats, ctrainlabels),
            headerList[0])))
    trainData.addCol("codes", "numeric", traincats.T.tolist()[0])
    # BUG FIX: the file handles were opened and never closed; "with"
    # guarantees they are closed even if writeOut raises
    with open('datasets/trainData.csv', 'w') as f:
        trainData.writeOut(f, trainData.getHeaderRaw(), "numeric")
    print("\n")
    classifier.confusionMatrixGraphic(
        classifier.confusionMatrix(traincats, ctrainlabels),
        headerList[0],
        title="Confusion Matrix of Training Data")
    print("Test Data")
    ctestcats, ctestlabels = classifier.classify(B)
    print(tabulate(
        classifier.confusionMatrixStr(
            classifier.confusionMatrix(testcats, ctestlabels),
            headerList[1])))
    testData.addCol("Test Cats", "numeric", testcats.T.tolist()[0])
    with open('datasets/testData.csv', 'w') as f:
        testData.writeOut(f, testData.getHeaderRaw(), "numeric")
    print("\n")
    classifier.confusionMatrixGraphic(
        classifier.confusionMatrix(testcats, ctestlabels),
        headerList[1],
        title="Confusion Matrix of Test Data")
embedding_featureset.append(vector) embedding_labels.append(embedded[i + 1]) assert len(embedding_featureset) == len( embedding_labels), "Did not get equal amount of predictions as points" fraction = 8.0 / 10.0 train_set = (embedding_featureset[:int(fraction * len(embedding_featureset))], embedding_labels[:int(fraction * len(embedding_featureset))]) test_set = (embedding_featureset[int(fraction * len(embedding_featureset)):], embedding_labels[int(fraction * len(embedding_featureset)):]) X_train, y_train = train_set[0], train_set[1] X_test, y_test = test_set[0], test_set[1] knn = classifiers.KNN(X_train, y_train, k=5) prediction = [] truth = [] train = copy.copy(X_train) for i in range(len(X_train)): xx, yy = train[i], y_train[i] prediction_embedded = knn.classify(xx) prediction.append(prediction_embedded[0]) truth.append(yy[0]) # Calculate the average error training = [row[0] for row in X_train] print("Error on training set: {0}".format( knn.error(prediction, truth, training)))
def _report_and_save(model, train, traincats, test, testcats, dtest, headers):
    """Classify the train/test sets with *model*, print both confusion
    matrices, then append the predicted test categories and write the test
    data to a user-named CSV file.  (Shared tail of the nb/knn branches.)"""
    print('\nClassifying training data')
    trainCat, trainLab = model.classify(train)
    print('\nBuilding training confusion matrix')
    traincmat = model.confusion_matrix(traincats, trainCat)
    print(model.confusion_matrix_str(traincmat))
    print('\nClassifying testing data')
    testCat, testLab = model.classify(test)
    print('\nBuilding testing confusion matrix')
    testcmat = model.confusion_matrix(testcats, testCat)
    print(model.confusion_matrix_str(testcmat))
    # write test data set and categories to CSV file
    filename = raw_input('Type filename for test data, then press enter: ')
    print('\nSaving test data')
    dtest.addColumn('Categories', 'numeric', testCat.T.tolist()[0])
    headers.append('Categories')
    dtest.write(filename, headers)


def main(argv):
    """Read training/test data (categories optionally in separate files),
    build a Naive Bayes or KNN classifier per argv[3] ('nb'/'knn'), and
    report and save the results.  The two original branches duplicated
    the entire classify/report/save sequence; that now lives once in
    _report_and_save."""
    usage = 'Usage: python %s <training data file> <test data file> <nb or knn> <optional training category file> <optional test category file>' % (argv[0])
    if len(argv) < 4:
        print(usage)
        exit(-1)
    # store classifier type
    classifier = argv[3]
    if classifier != 'nb' and classifier != 'knn':
        print(usage)
        exit(-1)
    print('\nReading data files')
    # read the training and test sets
    dtrain = data.Data(argv[1])
    dtest = data.Data(argv[2])
    # get the categories and the training data train and the test data test
    if len(argv) > 5:
        traincatdata = data.Data(argv[4])
        testcatdata = data.Data(argv[5])
        traincats = traincatdata.get_data([traincatdata.get_headers()[0]])
        testcats = testcatdata.get_data([testcatdata.get_headers()[0]])
        train = dtrain.get_data(dtrain.get_headers())
        test = dtest.get_data(dtest.get_headers())
        headers = dtest.get_headers()
    else:
        # assume the categories are the last column
        traincats = dtrain.get_data([dtrain.get_headers()[-1]])
        testcats = dtest.get_data([dtest.get_headers()[-1]])
        train = dtrain.get_data(dtrain.get_headers()[:-1])
        test = dtest.get_data(dtest.get_headers()[:-1])
        headers = dtest.get_headers()[:-1]
    if classifier == 'knn':
        # ask for the neighbor count (default 3)
        k = raw_input('How many nearest neighbors? (default=3) Type number then press enter: ')
        k = 3 if k == '' else abs(int(k))
        model = classifiers.KNN()
        print('\nTraining the classifier')
        model.build(train, traincats, k)
    else:
        model = classifiers.NaiveBayes()
        print('\nTraining the classifier')
        model.build(train, traincats)
    _report_and_save(model, train, traincats, test, testcats, dtest, headers)
X_train, y_train, X_test, y_test) print("The accuracy of Linear Regression is: {:.2f} %".format( accuracies_LinR.mean() * 100)) print("Standard Deviation of Linear Regression is {:.2f} %".format( accuracies_LinR.std() * 100)) #Logostic Regresion cn_LR, accuracy_LR, accuracies_LR = classifiers.Logistic_Regression( X_train, y_train, X_test, y_test) print("The accuracy of Logistic Regression is: {:.2f} %".format( accuracies_LR.mean() * 100)) print("Standard Deviation of Logistic Regression is {:.2f} %".format( accuracies_LR.std() * 100)) #KNN cn_KNN, accuracy_KNN, accuracies_KNN = classifiers.KNN(X_train, y_train, X_test, y_test) print("The accuracy of KNN is: {:.2f} %".format(accuracies_KNN.mean() * 100)) print("Standard Deviation of KNN is {:.2f} %".format(accuracies_KNN.std() * 100)) #SVM cn_SVM, accuracy_SVM, accuracies_SVM = classifiers.SVM(X_train, y_train, X_test, y_test) print("The accuracy of SVM is: {:.2f} %".format(accuracies_SVM.mean() * 100)) print("Standard Deviation of SVM is {:.2f} %".format(accuracies_SVM.std() * 100)) #Naive Bayes cn_GNB, accuracy_GNB, accuracies_GNB = classifiers.Naive_Bayes( X_train, y_train, X_test, y_test) print("The accuracy of Naive Bayes is: {:.2f} %".format(accuracies_GNB.mean() *
def main(argv):
    """Read training/test sets, build the classifier named by argv[1]
    ("KNN" or "Naive-Bayes"), print confusion matrices for both sets,
    and write the categorized test data to writtendatafile.csv.

    argv: [script, classtype, train_file, test_file, (train_cats, test_cats)]
    """
    # BUG FIX: argv[3] (the test file) is always read below, so at least
    # four arguments are required -- the guard was previously len(argv) < 3
    if len(argv) < 4:
        print('Usage: python %s <classtype> <training data file> <test data file> <optional training category file> <optional test category file>' % (argv[0]))
        exit(-1)
    # read the training and test sets
    dtrain = data.Data(argv[2])
    dtest = data.Data(argv[3])
    # get the categories and the training data A and the test data B
    if len(argv) > 5:
        traincatdata = data.Data(argv[4])
        testcatdata = data.Data(argv[5])
        traincats = traincatdata.get_data([traincatdata.get_headers()[0]])
        testcats = testcatdata.get_data([testcatdata.get_headers()[0]])
        A = dtrain.get_data(dtrain.get_headers())
        B = dtest.get_data(dtest.get_headers())
    else:
        # assume the categories are the last column
        traincats = dtrain.get_data([dtrain.get_headers()[-1]])
        testcats = dtest.get_data([dtest.get_headers()[-1]])
        A = dtrain.get_data(dtrain.get_headers()[:-1])
        B = dtest.get_data(dtest.get_headers()[:-1])
    if argv[1] == "KNN":
        print("You chose KNN")
        # create, build, and apply the KNN classifier
        knnc = classifiers.KNN()
        knnc.build(A, traincats)
        trainclasscats, trainclasslabels = knnc.classify(A)
        testclasscats, testclasslabels = knnc.classify(B)
        print("Training Confusion Matrix")
        print(knnc.confusion_matrix_str(
            knnc.confusion_matrix(traincats, trainclasscats)))
        print("Testing Confusion Matrix")
        print(knnc.confusion_matrix_str(
            knnc.confusion_matrix(testcats, testclasscats)))
    elif argv[1] == "Naive-Bayes":
        print("You chose Naive-Bayes")
        # create, build, and apply the Naive-Bayes classifier
        nbc = classifiers.NaiveBayes()
        nbc.build(A, traincats)
        trainclasscats, trainclasslabels = nbc.classify(A)
        testclasscats, testclasslabels = nbc.classify(B)
        print("Training Data Confusion Matrix")
        print(nbc.confusion_matrix_str(
            nbc.confusion_matrix(traincats, trainclasscats)))
        print("Test Data Confusion Matrix")
        print(nbc.confusion_matrix_str(
            nbc.confusion_matrix(testcats, testclasscats)))
    # NOTE(review): testclasscats is only bound inside the two branches
    # above; an unrecognized classtype reaches here and raises NameError
    dtest.addColumn("Classifiers", testclasscats)
    dtest.write("writtendatafile.csv")
def main(argv):
    '''Reads in a training set and a test set and builds a KNN classifier.
    Prints out confusion matrices and writes the classified test data to
    knncOut.csv.

    argv: [script, train_file, test_file, (train_cats, test_cats)]
    '''
    # usage
    if len(argv) < 3:
        print('Usage: python %s <training data file> <test data file> <optional training category file> <optional test category file>' % (argv[0]))
        exit(-1)
    # read the training and test sets
    dtrain = data.Data(argv[1])
    dtest = data.Data(argv[2])
    # get the categories and the training data A and the test data B
    if len(argv) > 4:
        traincatdata = data.Data(argv[3])
        testcatdata = data.Data(argv[4])
        traincats = traincatdata.get_data([traincatdata.get_headers()[0]])
        testcats = testcatdata.get_data([testcatdata.get_headers()[0]])
        A = dtrain.get_data(dtrain.get_headers())
        B = dtest.get_data(dtest.get_headers())
    else:
        # assume the categories are the last column
        traincats = dtrain.get_data([dtrain.get_headers()[-1]])
        testcats = dtest.get_data([dtest.get_headers()[-1]])
        A = dtrain.get_data(dtrain.get_headers()[:-1])
        B = dtest.get_data(dtest.get_headers()[:-1])
    # create KNN classifier and build it from the training data
    knnc = classifiers.KNN()
    knnc.build(A, traincats)
    # use the classifier on the training data
    knnctraincats, knnctrainlabels = knnc.classify(A)
    print("For KNN (training data):")
    print(knnc.confusion_matrix_str(
        knnc.confusion_matrix(traincats, knnctraincats)))
    # use the classifier on the test data
    knnctestcats, knnctestlabels = knnc.classify(B)
    print("For KNN (test data):")
    print(knnc.confusion_matrix_str(
        knnc.confusion_matrix(testcats, knnctestcats)))
    # write test data plus predicted category to csv
    # BUG FIX: the output file was never closed on an exception path;
    # the "with" statement guarantees it is closed
    with open("knncOut.csv", 'w') as knncfile:
        writeFile = csv.writer(knncfile)
        if len(argv) > 4:
            knncHeaders = dtest.get_headers()
        else:
            knncHeaders = dtest.get_headers()[:-1]
        knncHeaders.append("Category")
        writeFile.writerow(knncHeaders)
        writeFile.writerow(["numeric"] * len(knncHeaders))
        for i in range(B.shape[0]):
            rowList = B[i, :].tolist()
            rowList[0].append(knnctestcats[i, 0])
            writeFile.writerow(rowList[0])
    return
# NOTE(review): free-standing PyML model-selection experiments; `s` (a
# classifier) and the initial `d` (a dataset) must come from earlier in
# the file -- confirm against the surrounding script.
# Cross-validate a classifier wrapped with RFE feature selection.
m = composite.FeatureSelect(s, featsel.RFE())
r = m.cv(d, 3)
# Same, but filtering features by golub score (sigma=2 threshold).
fs = featsel.FeatureScore('golub')
f = featsel.Filter(fs, sigma=2)
m = composite.FeatureSelect(s, f)
r = m.cv(d, 3)
# Model selection over SVM soft-margin parameter C on the heart data.
d = datafunc.SparseDataSet('heart.data')
p = modelSelection.Param(svm.SVM(), 'C', [0.1, 1, 10, 100, 1000])
m = modelSelection.ModelSelector(p)
m.train(d)
# Model selection over KNN neighbor count k on the sparse heart data.
d = datafunc.SparseDataSet('heartSparse.data')
p = modelSelection.Param(classifiers.KNN(), 'k', [1, 2, 3, 5, 10, 15])
m = modelSelection.ModelSelector(p)
m.train(d)
# Cross-validate each parameter setting and collect the success rates.
r = p.cv(d, numFolds=10)
results = [r for r in p.cv(d, numFolds=10)]
results = [r.successRate for r in p.cv(d, numFolds=10)]
# Load the yeast datasets (labels in column 0 / column 1 respectively).
d = datafunc.SparseDataSet('yeast.data', labelsColumn=0)
d = datafunc.SparseDataSet('yeast2.data', labelsColumn=1)
from PyML import *
d = datafunc.VectorDataSet('yeast3.data', labelsColumn=1)
def main(argv):
    '''Reads in a training set and a test set and builds two KNN classifiers.
    One uses all of the data, one uses 10 exemplars. Then it classifies the
    training and test data and prints confusion matrices for the
    all-exemplar classifier.
    The first part, reading in two input files, is inspired by Bruce's code.
    '''
    # usage: need at least a training file and a test file
    if len(argv) < 3:
        print 'Usage: python %s <training data file> <test data file> <optional training category file> <optional test category file>' % (
            argv[0])
        exit(-1)
    """ Bruce KNN test code source starts here, with comments for my understanding """
    # read the training and test sets
    dtrain = data.Data(argv[1])
    dtest = data.Data(argv[2])
    # get the categories and the training data A and the test data B
    if len(argv) > 4:
        # category files given: categories are their first column, and
        # every column of the data files is a feature
        traincatdata = data.Data(argv[3])
        testcatdata = data.Data(argv[4])
        traincats = traincatdata.get_data([traincatdata.get_headers()[0]])
        testcats = testcatdata.get_data([testcatdata.get_headers()[0]])
        A = dtrain.get_data(dtrain.get_headers())
        B = dtest.get_data(dtest.get_headers())
    else:
        # assume the categories are the last column
        traincats = dtrain.get_data([dtrain.get_headers()[-1]
                                     ])  # training categories
        testcats = dtest.get_data([dtest.get_headers()[-1]])  # test categories
        A = dtrain.get_data(dtrain.get_headers()[:-1])  # train data matrice
        B = dtest.get_data(dtest.get_headers()[:-1])  # test data matrice
    # for float categories, turn them into ints
    # NOTE(review): if get_data returns a matrix, traincats[0] is a row,
    # never a float, so these branches may never fire — confirm the actual
    # return type of get_data
    new = []
    if type(traincats[0]) == float:
        for t in traincats:
            new.append(int(t))
        traincats = new
    new = []
    if type(testcats[0]) == float:
        new = []
        for t in testcats:
            new.append(int(t))
        testcats = new
    # create two classifiers, one using 10 exemplars per class
    knncall = classifiers.KNN()
    knnc10 = classifiers.KNN()
    #print type(type(traincats))
    # build the classifiers given data and categories
    knncall.build(A, traincats)
    knnc10.build(A, traincats, 10)  # specify K
    # use the classifiers on the training data, to try classify A
    classcats, alllabels = knncall.classify(A)
    # NOTE(review): tencats/tenlabels are computed here and again below but
    # never reported — the 10-exemplar results are currently unused
    tencats, tenlabels = knnc10.classify(A)
    """ #Bruce KNN test edited for my project code source ends here """
    # Classify the training set and print out a confusion matrix.
    # build confusion matrix and print it out
    confusion_matrix = knncall.confusion_matrix(traincats, classcats)
    # # print out the confusion matrix
    cmtx = knncall.confusion_matrix_str(confusion_matrix)
    #print classcats
    print " train set confusion matrix \n ", cmtx
    # Classify the test set and print out a confusion matrix.
    # use the classifiers on the test data, to try classify B
    classcats, alllabels = knncall.classify(B)
    tencats, tenlabels = knnc10.classify(B)
    # build confusion matrix and print it out
    confusion_matrix = knncall.confusion_matrix(testcats, classcats)
    # # print out the confusion matrix
    cmtx = knncall.confusion_matrix_str(confusion_matrix)
    #print classcats
    print " test set confusion matrix \n ", cmtx
    return
def main(argv):
    '''Reads in a training set and a test set and builds two KNN classifiers.
    One uses all of the data, one uses 10 exemplars. Then it classifies the
    test data and prints out the results.
    '''
    # require at least a training file and a test file on the command line
    if len(argv) < 3:
        print(
            'Usage: python %s <training data file> <test data file> <optional training category file> <optional test category file>'
            % (argv[0]))
        exit(-1)

    dtrain = data.Data(argv[1])
    dtest = data.Data(argv[2])

    # pull out the categories plus the train (A) and test (B) matrices
    if len(argv) > 4:
        # categories live in their own files: first column of each
        traincatdata = data.Data(argv[3])
        testcatdata = data.Data(argv[4])
        traincats = traincatdata.get_data([traincatdata.get_headers()[0]])
        testcats = testcatdata.get_data([testcatdata.get_headers()[0]])
        A = dtrain.get_data(dtrain.get_headers())
        B = dtest.get_data(dtest.get_headers())
    else:
        # otherwise the last column of each file holds the category
        traincats = dtrain.get_data([dtrain.get_headers()[-1]])
        testcats = dtest.get_data([dtest.get_headers()[-1]])
        A = dtrain.get_data(dtrain.get_headers()[:-1])
        B = dtest.get_data(dtest.get_headers()[:-1])

    # one classifier over every training point, one limited to 10 exemplars
    knncall = classifiers.KNN()
    knnc10 = classifiers.KNN()
    knncall.build(A, traincats)
    knnc10.build(A, traincats, 10)

    allcats, alllabels = knncall.classify(B)
    tencats, tenlabels = knnc10.classify(B)

    def show(count, labels):
        # print true vs. estimated category, flagging mismatches with **
        print(' True Est')
        for row in range(count):
            truth = int(testcats[row, 0])
            guess = int(labels[row, 0])
            suffix = '' if truth == guess else ' **'
            print("%03d: %4d %4d%s" % (row, truth, guess, suffix))

    print('Results using All Exemplars:')
    show(allcats.shape[0], alllabels)

    print(knnc10)
    print('Results using 10 Exemplars:')
    show(tencats.shape[0], tenlabels)
    return
def test_classifier(classifier, X, y):
    """Run 5 random 70/30 train/test rounds with the chosen classifier and
    report per-round hit rates plus confusion matrices for the best and
    worst rounds.

    Parameters:
        classifier -- a Classifier enum member (KNN, MDC or QC) selecting
                      which classifiers.* function scores each sample
        X -- feature matrix, one row per sample
        y -- labels; 0 is class "e", 1 is class "p"
    """
    print("###### Testing " + classifier.value + " ######")
    worst_hit_rate = np.inf
    best_hit_rate = 0
    best_y_pred = np.array(0)
    worst_y_pred = np.array(0)
    best_y_test = np.array(0)
    worst_y_test = np.array(0)
    # Map enum members to the per-sample classifier functions: one lookup
    # instead of an if/elif chain per sample, and an unsupported member now
    # fails fast with KeyError instead of silently leaving np.empty garbage
    # in y_pred.
    dispatch = {
        Classifier.KNN: classifiers.KNN,
        Classifier.MDC: classifiers.MDC,
        Classifier.QC: classifiers.QC,
    }
    predict = dispatch[classifier]
    for j in range(5):
        print("*** Round {}:".format(j + 1))
        X_train, X_test, y_train, y_test = partition(X, y, 0.7)
        n_e = list(y_test).count(0)  # class "e" samples in this split
        n_p = list(y_test).count(1)  # class "p" samples in this split
        y_pred = np.empty([len(y_test)])
        true_positives_p = 0  # correctly classified "p" samples
        true_positives_e = 0  # correctly classified "e" samples
        true_positives = 0
        for i in range(len(X_test)):
            y_pred[i] = predict(X_test[i], X_train, y_train)
            if y_pred[i] == y_test[i]:
                true_positives += 1
                if y_pred[i] == 0:
                    true_positives_e += 1
                else:
                    true_positives_p += 1
        print("True positives: {} from {} samples".format(
            true_positives, len(y_test)))
        print("True positives e: {} from {} samples".format(
            true_positives_e, n_e))
        print("True positives p: {} from {} samples".format(
            true_positives_p, n_p))
        hit_rate = true_positives / len(y_test)
        # guard against a random split that left one class empty, which
        # previously raised ZeroDivisionError
        hit_rate_e = true_positives_e / n_e if n_e else 0.0
        hit_rate_p = true_positives_p / n_p if n_p else 0.0
        print("Hit rate: {}".format(hit_rate))
        print("Hit rate e: {}".format(hit_rate_e))
        print("Hit rate p: {}".format(hit_rate_p))
        # track the best and worst rounds for the final report
        if hit_rate > best_hit_rate:
            best_y_pred = y_pred
            best_y_test = y_test
            best_hit_rate = hit_rate
        if hit_rate < worst_hit_rate:
            worst_y_pred = y_pred
            worst_y_test = y_test
            worst_hit_rate = hit_rate
    print("Best result: {}".format(best_hit_rate))
    print("Confusion matrix best result: ")
    cnf_matrix_best = confusion_matrix(best_y_test, best_y_pred)
    print(cnf_matrix_best)
    print("Worst result: {}".format(worst_hit_rate))
    print("Confusion matrix worst result: ")
    cnf_matrix_worst = confusion_matrix(worst_y_test, worst_y_pred)
    print(cnf_matrix_worst)
def _random_split(data_mat, labels, holdout=0.2):
    """Randomly route each row of data_mat (and its label) into a train or
    a test pile; roughly `holdout` of the rows land in the test pile.

    Returns (train_data, train_labels, test_data, test_labels) with the
    data stacked via np.vstack and the labels as column matrices.
    """
    train_rows, test_rows = [], []
    train_labels, test_labels = [], []
    for i in range(data_mat.shape[0]):
        if np.random.random() > holdout:
            train_labels.append(labels[i])
            train_rows.append(data_mat[i, :])
        else:
            test_labels.append(labels[i])
            test_rows.append(data_mat[i, :])
    return (np.vstack(train_rows), np.matrix(train_labels).T,
            np.vstack(test_rows), np.matrix(test_labels).T)


def main(argv):
    """Classify the 25 most frequent label classes of a data set with either
    a KNN (argv[3] == 0) or a neural-net (argv[3] == 1) classifier and report
    accuracy plus confusion matrices (saved as PNGs for the KNN case).
    """
    # usage
    if len(argv) < 4:
        print("Usage: python3 %s <data.csv> <metadata.csv> <0 - KNN; 1 -ANN>"
              % argv[0])
        exit()
    datafilename = argv[1]
    metadatafilename = argv[2]
    classifierType = int(argv[3])

    # read data; the first column is dropped (assumed to be an id column —
    # TODO confirm against the data file)
    datamat = np.genfromtxt(datafilename, delimiter=',')
    numdata = datamat.shape[0]
    datamat = datamat[:numdata, :]
    data = datamat[:, 1:].astype(np.float32)

    # read labels; label_map maps label name -> numeric code
    # (renamed from `dict`, which shadowed the builtin)
    labelsmat, label_map = readlabels(metadatafilename)
    inv_dict = {v: k for k, v in label_map.items()}
    labelsmat = labelsmat[:numdata, :]
    unique, counts = np.unique(labelsmat, return_counts=True, axis=0)

    # keep only the rows belonging to the 25 most frequent labels
    top25idx = np.argsort(counts)[::-1].tolist()[:25]
    top25idx = unique[top25idx, :].T.tolist()[0]
    data_top25 = []
    labels_top25 = []
    for i in range(data.shape[0]):
        if labelsmat[i, 0] in top25idx:
            labels_top25.append(labelsmat[i, 0])
            data_top25.append(data[i, :])
    data_top25 = np.matrix(data_top25)
    labels_top25 = np.matrix(labels_top25).T

    # inverse_top25 re-codes the surviving labels as 0..24
    unique_top25, inverse_top25, counts_top25 = np.unique(labels_top25,
                                                          return_counts=True,
                                                          return_inverse=True,
                                                          axis=0)
    print("Top 25 Labels:")
    for i in range(unique_top25.shape[0]):
        print(i, " : ", inv_dict[unique_top25[i, 0]], ", ", counts_top25[i])

    if classifierType == 1:
        print("******************ANN classifier:**************")
        print(data_top25.shape[0])
        (data_top25_train, labels_top25_train,
         data_top25_test, labels_top25_test) = _random_split(data_top25,
                                                             inverse_top25)
        # NOTE(review): the network is constructed from the TEST split and
        # then evaluated on that same data; it likely should be built and
        # trained on the train split — confirm intent.
        nnc = classifiers.NeuralNet(data_top25_test, labels_top25_test)
        print("NN testing data")
        test_new_cats = nnc.classify(data_top25_test)
        print(labels_top25_test.shape)
        print("NN fisnished prediction")
        print(nnc.accuracy(labels_top25_test, test_new_cats))
    elif classifierType == 0:
        print("******************KNN classifier:**************")
        print(data_top25.shape[0])
        # split training / testing
        (data_top25_train, labels_top25_train,
         data_top25_test, labels_top25_test) = _random_split(data_top25,
                                                             inverse_top25)
        K = 7
        print('Building KNN Classifier (K=%d)' % K)
        knnc = classifiers.KNN(data_top25_train, labels_top25_train, K)

        print('KNN Training Set Results')
        newcats, newlabels = knnc.classify(data_top25_train)
        accuracy = knnc.accuracy(labels_top25_train, newlabels)
        print("Training accuracy", accuracy)
        confmtx = knnc.confusion_matrix(labels_top25_train, newlabels)
        plt.matshow(confmtx)
        plt.title("Training: %d data; %.4f accruacy." %
                  (labels_top25_train.shape[0], accuracy))
        plt.savefig("../results/training.png", dpi=300)
        print(knnc.confusion_matrix_str(confmtx))

        print('KNN Test Set Results')
        newcats, newlabels = knnc.classify(data_top25_test)
        accuracy = knnc.accuracy(labels_top25_test, newlabels)
        print("Testing accuracy", accuracy)
        # print the confusion matrix
        confmtx = knnc.confusion_matrix(labels_top25_test, newlabels)
        plt.matshow(confmtx)
        plt.title("Testing: %d data; %.4f accruacy." %
                  (labels_top25_test.shape[0], accuracy))
        plt.savefig("../results/testing.png", dpi=300)
        print(knnc.confusion_matrix_str(confmtx))
    else:
        print("invalid argv[-1]")
        print(
            "Usage: python3 %s <data.csv> <metadata.csv> <0 - KNN; 1 - ANN>" %
            argv[0])
        sys.exit()
def _corrected_cats(catsData):
    """Re-code the raw category column as contiguous integers 0..K-1
    (the classifiers' confusion matrices expect re-coded categories)."""
    uniquelabels, corrected = np.unique(catsData.T.tolist()[0],
                                        return_inverse=True)
    return np.matrix([corrected]).T


def _print_results(clf, A, catsData):
    """Classify matrix A with clf, print its confusion matrix against the
    true categories in catsData, and return the predicted categories."""
    newcats, newlabels = clf.classify(A)
    confmtx = clf.confusion_matrix(_corrected_cats(catsData), newcats)
    print(clf.confusion_matrix_str(confmtx))
    return newcats


def _write_predictions(outputFile, headers, A, newcats):
    """Write A plus a 'predicted categories' column to outputFile as CSV,
    with a 'numeric' type row after the header row."""
    # copy the header list so we never mutate the caller's list (the Data
    # object may hand out its internal header list)
    outHeaders = list(headers) + ["predicted categories"]
    with open(outputFile, mode='w') as file:
        writer = csv.writer(file)
        writer.writerow(outHeaders)
        writer.writerow(["numeric" for i in range(len(outHeaders))])
        dataToWrite = A.tolist()
        for i in range(len(dataToWrite)):
            dataToWrite[i].append(newcats[i, 0])
            writer.writerow(dataToWrite[i])


def classify(trainingSet,
             testSet,
             bayes=True,
             optrainingCats=None,
             optestCats=None,
             outputFile="KNN.csv"):
    """Train a Naive Bayes (bayes=True) or KNN (bayes=False) classifier on
    trainingSet, print confusion matrices for the training and test sets,
    and write the test rows plus predicted categories to outputFile.

    optrainingCats / optestCats: optional CSV files holding the categories;
    when omitted, the last column of the corresponding data file is used.
    """
    print("in classify")
    dtrain = data.Data(trainingSet)
    dtest = data.Data(testSet)

    # was `!= None`; identity comparison is the idiomatic None test
    if optrainingCats is not None:
        trainHeaders = dtrain.get_headers()
        trainCats = data.Data(optrainingCats)
        trainCatsData = trainCats.newMatrix(trainCats.get_headers())
    else:
        trainHeaders = dtrain.get_headers()[:-1]
        trainCatsData = dtrain.newMatrix([dtrain.get_headers()[-1]])

    if optestCats is not None:
        # NOTE(review): test headers are taken from dtrain here (and in the
        # else branch); this only works when train and test files share a
        # schema — confirm whether dtest.get_headers() was intended.
        testHeaders = dtrain.get_headers()
        testCats = data.Data(optestCats)
        testCatsData = testCats.newMatrix(testCats.get_headers())
    else:
        testHeaders = dtrain.get_headers()[:-1]
        testCatsData = dtest.newMatrix([dtest.get_headers()[-1]])

    # build the requested classifier; print order matches the original
    if bayes:
        clf = classifiers.NaiveBayes(dtrain, trainHeaders, trainCatsData)
        print('Naive Bayes Training Set Results')
    else:
        print('Building KNN Classifier')
        clf = classifiers.KNN(dtrain, trainHeaders, trainCatsData, 5)
        print('KNN Training Set Results')

    # evaluate on the training data
    _print_results(clf, dtrain.newMatrix(trainHeaders), trainCatsData)

    # evaluate on the test data and keep the predictions for the CSV
    print('Naive Bayes Test Set Results' if bayes else 'KNN Test Set Results')
    A = dtest.newMatrix(testHeaders)
    newcats = _print_results(clf, A, testCatsData)

    _write_predictions(outputFile, testHeaders, A, newcats)