Example #1
0
def process(traind, trainc, testd, testc):
    """Train a KNN classifier and print its train and test confusion matrices.

    Each argument is handed to ``data.Data``.  ``traind``/``testd`` supply the
    feature columns (every header is used); ``trainc``/``testc`` supply the
    categories, of which only the first header's column is read.

    Nothing is returned; the two confusion matrices are printed.
    """
    exemplars = 6  # number of exemplars to use (KNN constructor K)
    neighbors = 3  # number of neighbors (classify K)

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("reading data")

    trdata = data.Data(traind)
    trc = data.Data(trainc)
    tedata = data.Data(testd)
    tec = data.Data(testc)

    A = trdata.get_data(trdata.get_headers()).T
    traincats = trc.get_data([trc.get_headers()[0]]).T

    print("building knn")

    knn = classifiers.KNN(dataObj=trdata,
                          headers=trdata.get_headers(),
                          categories=traincats,
                          K=exemplars)

    # Evaluate on the data the classifier was built from.
    cats, _ = knn.classify(A, K=neighbors)
    print(knn.confusion_matrix_str(knn.confusion_matrix(traincats, cats)))

    # Evaluate on the separate test data.
    B = tedata.get_data(tedata.get_headers()).T
    testcats = tec.get_data([tec.get_headers()[0]]).T
    cats, _ = knn.classify(B, K=neighbors)
    print(knn.confusion_matrix_str(knn.confusion_matrix(testcats, cats)))
Example #2
0
def KNN_classify_complete():
    """Run KNN on the word-prime data using every column but the last.

    Reads ``wordPrimeTrain.csv`` and ``wordPrimeTest.csv``, builds a KNN
    classifier from the training columns and the ``category`` column, then
    prints a confusion matrix for the training set and for the test set.
    """
    train_set = data.Data("wordPrimeTrain.csv")
    test_set = data.Data("wordPrimeTest.csv")

    # Features are all headers except the final one.
    train_features = train_set.get_columns(train_set.get_headers()[:-1])
    test_features = test_set.get_columns(test_set.get_headers()[:-1])
    train_truth = train_set.get_columns(["category"])
    test_truth = test_set.get_columns(["category"])

    knn = classifiers.KNN()
    knn.build(train_features, train_truth, 5)

    print("complete KNN, confusion matrix\n")

    # Same report for both sets: classify, then tabulate against the truth.
    for banner, features, truth in (("on train data\n", train_features, train_truth),
                                    ("on test data\n", test_features, test_truth)):
        print(banner)
        _, predicted_labels = knn.classify(features)
        matrix = knn.confusion_matrix(truth, predicted_labels)
        print(knn.confusion_matrix_str(matrix))
Example #3
0
def KNN_classify_partial():
    """Run KNN on the word-prime data using four selected feature columns.

    Reads ``wordPrimeTrain.csv`` and ``wordPrimeTest.csv``, builds a KNN
    classifier from the chosen columns and the ``category`` column, then
    prints a confusion matrix for the training set and for the test set.
    """
    feature_names = ("word_dist", "pron_dist", "Target_Freq_N", "cue_Freq_N")

    train_set = data.Data("wordPrimeTrain.csv")
    test_set = data.Data("wordPrimeTest.csv")

    # A fresh list per call, as each get_columns call originally received.
    train_features = train_set.get_columns(list(feature_names))
    test_features = test_set.get_columns(list(feature_names))
    train_truth = train_set.get_columns(["category"])
    test_truth = test_set.get_columns(["category"])

    knn = classifiers.KNN()
    knn.build(train_features, train_truth, 5)

    print("partial KNN, confusion matrix\n")

    # Same report for both sets: classify, then tabulate against the truth.
    for banner, features, truth in (("on train data\n", train_features, train_truth),
                                    ("on test data\n", test_features, test_truth)):
        print(banner)
        _, predicted_labels = knn.classify(features)
        matrix = knn.confusion_matrix(truth, predicted_labels)
        print(knn.confusion_matrix_str(matrix))
Example #4
0
def process(traind, trainc, testd, testc, write=True, K=10):
    """Classify a test set with KNN and return it with the predictions added.

    ``traind``/``testd`` and ``trainc``/``testc`` are passed to ``data.Data``
    for the feature data and the category data respectively.  Confusion
    matrices for the training and test sets are printed.  The predicted test
    categories are added to the test data as a numeric ``Category`` column.

    Args:
        write: when truthy, the augmented test data is written to
            ``results_knn.csv``.
        K: forwarded as the last argument of the ``classifiers.KNN``
            constructor.

    Returns:
        The test ``data.Data`` object, including the new ``Category`` column.
    """
    out_path = "results_knn.csv"
    print("Reading data")

    dtrain = data.Data(traind)
    dtest = data.Data(testd)

    train_headers = dtrain.get_headers()
    # NOTE(review): the test headers are also read from dtrain, not dtest —
    # confirm that both files are expected to share the same headers.
    test_headers = dtrain.get_headers()

    train_cat_obj = data.Data(trainc)
    train_cat_matrix = train_cat_obj.all_rows_specified_columns(
        train_cat_obj.get_headers())

    test_cat_obj = data.Data(testc)
    test_cat_matrix = test_cat_obj.all_rows_specified_columns(
        test_cat_obj.get_headers())

    # Re-code the first category column as indices into its sorted unique
    # values, shaped as a column matrix.  Train and test are coded separately.
    _, train_codes = numpy.unique(train_cat_matrix.T.tolist()[0],
                                  return_inverse=True)
    train_codes = numpy.matrix([train_codes]).T

    _, test_codes = numpy.unique(test_cat_matrix.T.tolist()[0],
                                 return_inverse=True)
    test_codes = numpy.matrix([test_codes]).T

    print('Building KNN Classifier')
    knnc = classifiers.KNN(dtrain, train_headers, train_cat_matrix, K)

    print('KNN Training Set Results')
    train_points = dtrain.all_rows_specified_columns(train_headers)
    predicted, _ = knnc.classify(train_points)
    print(knnc.confusion_matrix_str(
        knnc.confusion_matrix(train_codes, predicted)))

    print('KNN Test Set Results')
    test_points = dtest.all_rows_specified_columns(test_headers)
    predicted, _ = knnc.classify(test_points)
    print(knnc.confusion_matrix_str(
        knnc.confusion_matrix(test_codes, predicted)))

    dtest.addColumn("Category", "numeric", predicted.T.A[0])

    # Optionally persist the test data together with the new column.
    if write:
        dtest.write(out_path, headers=dtest.get_headers())

    return dtest
Example #5
0
def training(argv):
	"""Build a classifier from a training set and evaluate it on both sets.

	``argv[0]`` and ``argv[1]`` name the training and test data files.  When
	``argv`` has more than four entries, ``argv[2]`` and ``argv[3]`` name
	separate category files (first column used); otherwise the last column of
	each data file is taken as the categories.  ``argv[-1]`` selects the
	classifier: ``"NaiveBayes"`` for Naive Bayes, anything else for KNN.

	Returns a 6-tuple: training confusion matrix, test confusion matrix,
	training categories as a list, test categories as a list, the training
	data object and the test data object.
	"""
	trainData = data.Data(argv[0])
	testData = data.Data(argv[1])

	# Raw headers of both sets; not consulted again in this function.
	headerList = [trainData.getHeaderRaw(), testData.getHeaderRaw()]

	separate_category_files = len(argv) > 4
	if separate_category_files:
		train_cat_source = data.Data(argv[2])
		test_cat_source = data.Data(argv[3])

		# getDataNum expects a list of headers, hence the one-element lists.
		traincats = train_cat_source.getDataNum([train_cat_source.getHeaderRaw()[0]])
		testcats = test_cat_source.getDataNum([test_cat_source.getHeaderRaw()[0]])

		A = trainData.getDataNum(trainData.getHeaderRaw())
		B = testData.getDataNum(testData.getHeaderRaw())
	else:
		# Categories are assumed to be the last column of each data file.
		traincats = trainData.getDataNum([trainData.getHeaderRaw()[-1]])
		testcats = testData.getDataNum([testData.getHeaderRaw()[-1]])
		A = trainData.getDataNum(trainData.getHeaderRaw()[:-1])
		B = testData.getDataNum(testData.getHeaderRaw()[:-1])

	wants_naive_bayes = argv[-1] == "NaiveBayes"
	classifier = classifiers.NaiveBayes() if wants_naive_bayes else classifiers.KNN()

	print("this may take a little while...")

	classifier.build(A, traincats)
	_, predicted_train_labels = classifier.classify(A)
	_, predicted_test_labels = classifier.classify(B)

	train_matrix = classifier.confusionMatrix(traincats, predicted_train_labels)
	test_matrix = classifier.confusionMatrix(testcats, predicted_test_labels)

	print("done training")

	return (train_matrix, test_matrix,
			traincats.T.tolist()[0], testcats.T.tolist()[0],
			trainData, testData)
Example #6
0
def main(argv):
    '''Build two KNN classifiers from the iris data and print each one.

    The first classifier is built from all exemplars; the second is built
    with 10 exemplars per class.

    The data file name is hard-coded, so ``argv[1]`` is not read.  When
    ``argv`` has more than two entries, ``argv[2]`` names a category file
    whose first column supplies the categories; otherwise the last column of
    the data file is used as the categories.
    '''
    d = data.Data("iris_proj8_all.csv")

    has_category_file = len(argv) > 2
    if not has_category_file:
        # Categories are assumed to be the last column of the data file.
        cats = d.get_data([d.get_headers()[-1]])
        A = d.get_data(d.get_headers()[:-1])
    else:
        category_source = data.Data(argv[2])
        cats = category_source.get_data([category_source.get_headers()[0]])
        A = d.get_data(d.get_headers())

    # Classifier built from every exemplar; printing relies on __str__.
    full_knn = classifiers.KNN()
    full_knn.build(A, cats)
    print(full_knn)

    # Classifier built with 10 exemplars per class.
    reduced_knn = classifiers.KNN()
    reduced_knn.build(A, cats, 10)
    print(reduced_knn)
Example #7
0
def main(argv):
    """Classify a test set with a user-chosen classifier and save the result.

    ``argv[1]`` and ``argv[2]`` name the training and test data files.  When
    ``argv`` has more than four entries, ``argv[3]`` and ``argv[4]`` name
    separate category files (first column used); otherwise the last column of
    each data file is taken as the categories.

    The user is prompted for the classifier type (``n`` for Naive Bayes, ``k``
    for KNN).  Confusion matrices for the training and test sets are printed,
    the predicted test categories are added to the test data, and the result
    is written to ``categorizedData.csv``.

    Exits with status -1 when too few arguments are given; returns early when
    the classifier choice is not recognised.
    """
    import sys

    if len(argv) < 3:
        print('Usage: python %s <training data file> <test data file> <optional training category file> <optional test category file>' % (
            argv[0]))
        sys.exit(-1)

    # raw_input exists only on Python 2; fall back to input on Python 3.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    # read the training and test sets
    dtrain = data.Data(argv[1])
    dtest = data.Data(argv[2])

    # get the categories and the training data A and the test data B
    if len(argv) > 4:
        traincatdata = data.Data(argv[3])
        testcatdata = data.Data(argv[4])
        traincats = traincatdata.get_data([traincatdata.get_headers()[0]])
        testcats = testcatdata.get_data([testcatdata.get_headers()[0]])
        A = dtrain.get_data(dtrain.get_headers())
        B = dtest.get_data(dtest.get_headers())
    else:
        # assume the categories are the last column
        traincats = dtrain.get_data([dtrain.get_headers()[-1]])
        testcats = dtest.get_data([dtest.get_headers()[-1]])
        A = dtrain.get_data(dtrain.get_headers()[:-1])
        B = dtest.get_data(dtest.get_headers()[:-1])

    userChoice = read_line(
        "Which classifier would you like to use?\n[n] for Naive Bayes and [k] for KNN: "
    ).lower()

    if userChoice == 'n':
        classifier = classifiers.NaiveBayes()
    elif userChoice == 'k':
        classifier = classifiers.KNN()
    else:
        print("type in valid classifier type")
        return

    # build the classifier with the training set categories
    classifier.build(A, traincats)

    # classify the training set and print its confusion matrix; the string
    # was previously built and discarded without being shown
    catsTrain, _ = classifier.classify(A)
    print(classifier.confusion_matrix_str(
        classifier.confusion_matrix(traincats, catsTrain)))

    # classify the test set and print its confusion matrix
    catsTest, _ = classifier.classify(B)
    print(classifier.confusion_matrix_str(
        classifier.confusion_matrix(testcats, catsTest)))

    # add the predicted category column and write the test data to csv
    dtest.addColumn(['category', 'numeric'] + catsTest.T.tolist()[0])
    dtest.writeToCSV('categorizedData.csv', dtest.get_raw_headers())
Example #8
0
def build_classifier(training_data, training_labels, method):
    """Create and build the classifier named by *method*.

    Args:
        training_data: passed to the classifier's ``build`` as the data.
        training_labels: passed to the classifier's ``build`` as the labels.
        method: ``"Naive Bayes"`` or ``"naivebayes"`` for Naive Bayes;
            ``"K-Nearest Neighbors"`` or ``"knn"`` for KNN.

    Returns:
        The built classifier.

    Raises:
        SystemExit: with code -1, after printing a message, when *method* is
            not one of the recognised names.
    """
    import sys

    if method in ("Naive Bayes", "naivebayes"):
        classifier = classifiers.NaiveBayes()
    elif method in ("K-Nearest Neighbors", "knn"):
        classifier = classifiers.KNN()
    else:
        print("Unknown method: Use 'knn' or 'naivebayes'")
        # sys.exit rather than the site-provided exit(), which is meant for
        # the interactive shell and is not always available.
        sys.exit(-1)

    classifier.build(training_data, training_labels)
    return classifier
def export_classifiers():
    """Fit one classifier of each kind and return them keyed by short name.

    Training features come from the ``fs_1`` pickle and test features from the
    ``fs_test_1`` pickle; the ``fs_words_bigrams_pos`` feature set is used for
    both.  Each classifier is constructed with the test data and then fitted
    on the training data with fixed parameters.

    Returns:
        dict mapping ``'knn'``, ``'nb'``, ``'dt'``, ``'rf'``, ``'xgb'``,
        ``'svm'``, ``'nn'`` and ``'mc'`` to the fitted classifier objects.
    """
    train_bundle = util.load_pickle(name='fs_1', path='..\\pickles\\feature_sets\\')
    print('trained', size(train_bundle))
    test_bundle = util.load_pickle(name='fs_test_1', path='..\\pickles\\test_features\\')
    print('test', size(test_bundle))

    raw_test_data = test_bundle['data_set']
    feature_key = 'fs_words_bigrams_pos'

    X_train = train_bundle[feature_key]
    y_train = train_bundle['labels']
    X_test = test_bundle[feature_key]
    y_test = test_bundle['labels']
    n_features = X_train.shape[1]

    # Construct every classifier first; KNN is given dense arrays and MC is
    # given the raw test data set rather than the feature matrix.
    clfs = {}
    clfs['knn'] = c.KNN(X_test=X_test.toarray(), y_test=y_test)
    clfs['nb'] = c.NB(X_test=X_test, y_test=y_test)
    clfs['dt'] = c.DT(X_test=X_test, y_test=y_test)
    clfs['rf'] = c.RF(X_test=X_test, y_test=y_test)
    clfs['xgb'] = c.XGB(X_test=X_test, y_test=y_test)
    clfs['svm'] = c.SVM(X_test=X_test, y_test=y_test)
    clfs['nn'] = c.NN(X_test=X_test, y_test=y_test)
    clfs['mc'] = c.MC(X_test=raw_test_data, y_test=y_test)

    # Fit in the same order, each with its fixed parameter set.
    knn_params = {'leaf_size': 100, 'n_jobs': -1, 'n_neighbors': 55, 'p': 3}
    clfs['knn'].fit(X_train.toarray(), y_train, params=knn_params)

    nb_params = {'alpha': 1.5}
    clfs['nb'].fit(X_train, y_train, params=nb_params)

    dt_params = {'max_depth': 8, 'min_samples_leaf': 3}
    clfs['dt'].fit(X_train, y_train, params=dt_params)

    rf_params = {'min_samples_leaf': 20, 'n_estimators': 500, 'n_jobs': -1}
    clfs['rf'].fit(X_train, y_train, params=rf_params)

    xgb_params = {'learning_rate': 0.125, 'max_depth': 10, 'n_estimators': 400}
    clfs['xgb'].fit(X_train, y_train, params=xgb_params)

    svm_params = {'C': 2, 'kernel': 'linear', 'probability': True}
    clfs['svm'].fit(X_train, y_train, params=svm_params)

    # The layer objects are created immediately before the network is fitted.
    nn_layers = [
        Dropout(0.5, input_shape=(n_features,)),
        Dense(50, kernel_initializer='normal', activation='relu', kernel_constraint=maxnorm(3)),
        Dropout(0.5),
        Dense(50, kernel_initializer='normal', activation='sigmoid'),
        Dropout(0.25),
        Dense(1, kernel_initializer='normal', activation='sigmoid'),
    ]
    clfs['nn'].fit(X_train, y_train, params={'epochs': 10, 'layers': nn_layers})

    clfs['mc'].fit(X_train=X_train, y_train=y_train)

    return clfs
Example #10
0
def _embed_images(embedding_model, csv_path, tag):
    """Read image data from *csv_path* and return its embeddings.

    The array returned by ``read_data`` is cast to float32, divided by 255 and
    given a new axis at position 3 before being passed to
    ``embedding_model.predict``.  *tag* only prefixes the printed shapes.
    """
    images = read_data(csv_path).astype('float32')
    images /= 255
    images = np.expand_dims(images, axis=3)
    print("%s input shape: " % tag, images.shape)

    embedded = embedding_model.predict(images)
    print("%s output shape: " % tag, embedded.shape)
    return embedded


def main(argv):
    """Classify embedded greek-letter images with KNN and print the results.

    ``argv[1]`` is the saved model; ``argv[2]``/``argv[3]`` are the training
    data and labels; ``argv[4]``/``argv[5]`` are the test data and labels.
    The output of the model's third-from-last layer is used as the embedding.
    A KNN classifier with K=3 is built from the training embeddings, and
    confusion matrices are printed for the training and test sets.

    Exits with status -1 after printing a usage message when fewer than six
    arguments are supplied.
    """
    import sys

    # usage
    if len(argv) < 6:
        print(
            "Usage: python3 %s <model.h5> <greek_data.csv> <greek_training_labels.csv> <greek_test_data.csv> <greek_test_labels.csv>"
            % argv[0])
        # A usage error must not report success to the calling shell.
        sys.exit(-1)

    # load the model as an embedding space
    print("Loading model from %s" % argv[1])
    model = load_model(argv[1])
    embedding_model = Model(inputs=model.input,
                            outputs=model.layers[-3].output)

    # training data and labels
    greek_training_data_output = _embed_images(embedding_model, argv[2],
                                               "training")
    greek_training_labels = read_labels(argv[3])

    # testing data and labels
    greek_testing_data_output = _embed_images(embedding_model, argv[4],
                                              "testing")
    greek_testing_labels = read_labels(argv[5])

    # test KNN classifier
    print("Testing KNN classifier")
    training_cats = np.matrix(greek_training_labels).T
    testing_cats = np.matrix(greek_testing_labels).T

    K = 3
    print('Building KNN Classifier (K=%d)' % K)
    knnc = classifiers.KNN(greek_training_data_output, training_cats, K)

    print('KNN Training Set Results')
    newcats, _ = knnc.classify(greek_training_data_output)
    print(knnc.confusion_matrix_str(
        knnc.confusion_matrix(training_cats, newcats)))

    print('KNN Test Set Results')
    # With the extra True argument classify returns a third value, which is
    # not needed here.
    newcats, _, _ = knnc.classify(greek_testing_data_output, True)
    print(knnc.confusion_matrix_str(
        knnc.confusion_matrix(testing_cats, newcats)))
Example #11
0
def buildClassifier(trainFile,
                    testFile,
                    tCats=None,
                    ttCats=None,
                    classType="NaiveBayes",
                    save=False,
                    K=None):
    '''Build a classifier, print its confusion matrices and return it.

    Code inspired by Bruce's code.

    Args:
        trainFile: training data file, passed to ``data.Data``.
        testFile: test data file, passed to ``data.Data``.
        tCats: optional training category file.
        ttCats: optional test category file.  Separate category files are
            used only when both ``tCats`` and ``ttCats`` are given; otherwise
            the last column of each data file is taken as the categories.
        classType: ``"KNN"`` for a KNN classifier; any other value builds a
            Naive Bayes classifier.
        save: when truthy, the predicted test categories and labels are
            written to ``cTestCats.csv`` and ``cTestLabels.csv``.
        K: K passed to the KNN constructor; 3 is used when it is None.

    Returns:
        The built classifier.
    '''
    dtrain = data.Data(trainFile)
    dtest = data.Data(testFile)

    if tCats is not None and ttCats is not None:
        traincatdata = data.Data(tCats)
        traincats = traincatdata.get_data([traincatdata.get_headers()[0]],
                                          traincatdata.get_num_rows())
        testcatdata = data.Data(ttCats)
        testcats = testcatdata.get_data([testcatdata.get_headers()[0]],
                                        testcatdata.get_num_rows())
        A = dtrain.get_data(dtrain.get_headers(), dtrain.get_num_rows())
        B = dtest.get_data(dtest.get_headers(), dtest.get_num_rows())
    else:
        # assume the categories are the last column
        traincats = dtrain.get_data([dtrain.get_headers()[-1]],
                                    dtrain.get_num_rows())
        testcats = dtest.get_data([dtest.get_headers()[-1]],
                                  dtest.get_num_rows())
        A = dtrain.get_data(dtrain.get_headers()[:-1], dtrain.get_num_rows())
        B = dtest.get_data(dtest.get_headers()[:-1], dtest.get_num_rows())

    # Only the requested classifier is constructed; Naive Bayes is the default.
    if classType == "KNN":
        classifier = classifiers.KNN(K=3 if K is None else K)
    else:
        classifier = classifiers.NaiveBayes()

    # Build from the training data, then classify both sets.
    classifier.build(A, traincats)
    ctraincats, ctrainlabels = classifier.classify(A)
    ctestcats, ctestlabels = classifier.classify(B)

    if save:
        ctestcats.tofile('cTestCats.csv', sep=" ", format="%s")
        ctestlabels.tofile('cTestLabels.csv', sep=" ", format="%s")

    print("Training Data")
    print(classifier.confusion_matrix_str(
        classifier.confusion_matrix(traincats, ctraincats)))
    print("Test Data")
    print(classifier.confusion_matrix_str(
        classifier.confusion_matrix(testcats, ctestcats)))

    return classifier
Example #12
0
def main(argv):
    """Classify a test set with Naive Bayes or KNN and write the predictions.

    ``argv[1]`` and ``argv[2]`` name the training and test data files and
    ``argv[3]`` selects the classifier (``n`` for Naive Bayes, ``k`` for KNN).
    When ``argv[4]`` and ``argv[5]`` are present they name separate training
    and test category files; otherwise the last column of each data file is
    taken as the categories.

    Confusion matrices for the training and test sets are printed, and the
    test data plus a ``predicted`` column is written to ``heresyourdata.csv``.

    Exits with status -1 after printing a usage message when the arguments
    are missing or the classifier flag is not ``k`` or ``n``.
    """
    import sys

    if len(argv) < 4 or argv[3] not in ('k', 'n'):
        print(
            'Usage: python %s <training data file> <test data file> <n for Naive Bayes, k for KNN> <optional training categories file> <optional test categories file>'
            % (argv[0]))
        print(
            '    If categories are not provided as separate files, then the last column is assumed to be the category.'
        )
        sys.exit(-1)

    train_file = argv[1]
    test_file = argv[2]
    knn = argv[3] == 'k'
    dtrain = data.Data(train_file)
    dtest = data.Data(test_file)

    # NOTE(review): in both branches the test headers are read from dtrain,
    # not dtest — confirm the two files are expected to share headers.
    if len(argv) >= 6:
        train_headers = dtrain.get_headers()
        test_headers = dtrain.get_headers()

        traincat_source = data.Data(argv[4])
        traincatdata = traincat_source.limit_columns(
            traincat_source.get_headers())

        testcat_source = data.Data(argv[5])
        testcatdata = testcat_source.limit_columns(
            testcat_source.get_headers())
    else:
        train_headers = dtrain.get_headers()[:-1]
        test_headers = dtrain.get_headers()[:-1]

        traincatdata = dtrain.limit_columns([dtrain.get_headers()[-1]])
        testcatdata = dtest.limit_columns([dtest.get_headers()[-1]])

    # Re-code the categories as indices into their sorted unique values.
    # This must run for BOTH branches above: the confusion matrices below
    # need the corrected categories whether or not separate category files
    # were supplied (previously they were undefined in the separate-file case).
    _, correctedtraincats = np.unique(traincatdata.T.tolist()[0],
                                      return_inverse=True)
    correctedtraincats = np.matrix([correctedtraincats]).T

    _, correctedtestcats = np.unique(testcatdata.T.tolist()[0],
                                     return_inverse=True)
    correctedtestcats = np.matrix([correctedtestcats]).T

    if not knn:
        nbc = classifiers.NaiveBayes(dtrain, train_headers, traincatdata)

        print('Naive Bayes Training Set Results')
        A = dtrain.limit_columns(train_headers)

        newcats, newlabels = nbc.classify(A)

        print('making confusion matrix')
        confmtx = nbc.confusion_matrix(correctedtraincats, newcats)

        print(nbc.confusion_matrix_str(confmtx))

        print('Naive Bayes Test Set Results')
        # Convert leading headers to int, stopping at the first one that is
        # not an integer.
        for i in range(len(test_headers)):
            try:
                test_headers[i] = int(test_headers[i])
            except (TypeError, ValueError):
                break

        A = dtest.limit_columns(test_headers)

        print('classifying with naive bayes classifier')
        newcats, newlabels = nbc.classify(A)

        print('confusion matrix')
        confmtx = nbc.confusion_matrix(correctedtestcats, newcats)
        print(nbc.confusion_matrix_str(confmtx))

    else:
        print('knn')
        print('-----------------')
        print('Building KNN Classifier')
        knnc = classifiers.KNN(dtrain, train_headers, traincatdata, 3)

        print('KNN Training Set Results')
        A = dtrain.limit_columns(train_headers)

        newcats, newlabels = knnc.classify(A)
        confmtx = knnc.confusion_matrix(correctedtraincats, newcats)
        print(knnc.confusion_matrix_str(confmtx))

        print('KNN Test Set Results')
        A = dtest.limit_columns(test_headers)

        newcats, newlabels = knnc.classify(A)

        print('KNN TEST::Correct labels\n', correctedtestcats.T)
        print('KNN TEST:::Predicted labels\n', newcats)

        # print the confusion matrix
        confmtx = knnc.confusion_matrix(correctedtestcats, newcats)
        print(knnc.confusion_matrix_str(confmtx))

    # newcats now holds the test-set predictions from whichever branch ran.
    test_headers.append('predicted')

    dtest.add_header2col('predicted')
    dtest.add_column(newcats.T)
    dtest.write("heresyourdata.csv", test_headers)
    return
Example #13
0
def main(argv):
    """Train a classifier, report confusion matrices and save both data sets.

    ``argv[1]`` and ``argv[2]`` name the training and test data files.  When
    ``argv`` has more than four entries, ``argv[3]`` and ``argv[4]`` name
    separate category files (first column used); otherwise the last column of
    each data file is taken as the categories.  ``argv[-1]`` selects the
    classifier: ``"NaiveBayes"`` for Naive Bayes, anything else for KNN.

    For each set the confusion matrix is printed and drawn, the true
    categories are added to the data as a numeric column, and the data is
    written to ``datasets/trainData.csv`` / ``datasets/testData.csv``.

    Returns None; with fewer than three arguments only a usage line is
    printed.
    """
    # usage
    if len(argv) < 3:
        print("usage: python %s <Training File> <Test File> <opt: Training Categories> <opt: Test Categories> <KNN or NaiveBayes>" % (
            argv[0]))
        return

    # read the training and test sets
    print("Reading: \n  Training: %s\n  Test: %s\n  KNN/NB: %s\n  " % (
        argv[1], argv[2], argv[-1]))

    trainData = data.Data(argv[1])
    testData = data.Data(argv[2])

    # raw headers, passed along when formatting the confusion matrices
    train_headers = trainData.getHeaderRaw()
    test_headers = testData.getHeaderRaw()

    # get the categories and the training data A and the test data B
    if len(argv) > 4:
        traincatdata = data.Data(argv[3])
        testcatdata = data.Data(argv[4])

        # getDataNum needs a list of headers
        traincats = traincatdata.getDataNum([traincatdata.getHeaderRaw()[0]])
        testcats = testcatdata.getDataNum([testcatdata.getHeaderRaw()[0]])

        A = trainData.getDataNum(trainData.getHeaderRaw())
        B = testData.getDataNum(testData.getHeaderRaw())
    else:
        # assume the categories are the last column
        traincats = trainData.getDataNum([trainData.getHeaderRaw()[-1]])
        testcats = testData.getDataNum([testData.getHeaderRaw()[-1]])
        A = trainData.getDataNum(trainData.getHeaderRaw()[:-1])
        B = testData.getDataNum(testData.getHeaderRaw()[:-1])

    if argv[-1] == "NaiveBayes":
        classifier = classifiers.NaiveBayes()
    else:
        classifier = classifiers.KNN()

    classifier.build(A, traincats)
    ctraincats, ctrainlabels = classifier.classify(A)

    print("Training Data")
    print(tabulate(
        classifier.confusionMatrixStr(
            classifier.confusionMatrix(traincats, ctrainlabels),
            train_headers)))

    trainData.addCol("codes", "numeric", traincats.T.tolist()[0])
    # The with-block guarantees the output is flushed and the handle closed.
    with open('datasets/trainData.csv', 'w') as f:
        trainData.writeOut(f, trainData.getHeaderRaw(), "numeric")
    print("\n")

    classifier.confusionMatrixGraphic(
        classifier.confusionMatrix(traincats, ctrainlabels),
        train_headers,
        title="Confusion Matrix of Training Data")

    print("Test Data")
    ctestcats, ctestlabels = classifier.classify(B)
    print(tabulate(
        classifier.confusionMatrixStr(
            classifier.confusionMatrix(testcats, ctestlabels), test_headers)))

    testData.addCol("Test Cats", "numeric", testcats.T.tolist()[0])
    with open('datasets/testData.csv', 'w') as f:
        testData.writeOut(f, testData.getHeaderRaw(), "numeric")
    print("\n")

    classifier.confusionMatrixGraphic(
        classifier.confusionMatrix(testcats, ctestlabels),
        test_headers,
        title="Confusion Matrix of Test Data")
Example #14
0
    embedding_featureset.append(vector)
    embedding_labels.append(embedded[i + 1])

# Every feature vector must have exactly one label.
assert len(embedding_featureset) == len(
    embedding_labels), "Did not get equal amount of predictions as points"

# Split at 80%: the first part is the training set, the rest the test set.
fraction = 8.0 / 10.0
split_at = int(fraction * len(embedding_featureset))
train_set = (embedding_featureset[:split_at], embedding_labels[:split_at])
test_set = (embedding_featureset[split_at:], embedding_labels[split_at:])

X_train, y_train = train_set
X_test, y_test = test_set

knn = classifiers.KNN(X_train, y_train, k=5)

# Classify each training point and collect the first component of both the
# prediction and the true label.
prediction, truth = [], []
train = copy.copy(X_train)
for i, (xx, yy) in enumerate(zip(train, y_train)):
    prediction_embedded = knn.classify(xx)

    prediction.append(prediction_embedded[0])
    truth.append(yy[0])

# Calculate the average error
training = [point[0] for point in X_train]
train_error = knn.error(prediction, truth, training)
print("Error on training set: {0}".format(train_error))
Example #15
0
def main(argv):
    """Classify a test set with Naive Bayes or KNN and save the predictions.

    ``argv[1]`` and ``argv[2]`` name the training and test data files and
    ``argv[3]`` selects the classifier (``nb`` or ``knn``).  When ``argv`` has
    more than five entries, ``argv[4]`` and ``argv[5]`` name separate category
    files (first column used); otherwise the last column of each data file is
    taken as the categories.

    For KNN the user is prompted for the number of neighbors (default 3).
    Confusion matrices for the training and test sets are printed, then the
    user is prompted for a file name and the test data plus a ``Categories``
    column is written there.

    Exits with status -1 after printing a usage message when arguments are
    missing or the classifier type is not ``nb`` or ``knn``.
    """
    import sys

    # raw_input exists only on Python 2; fall back to input on Python 3.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    #usage
    if len(argv) < 4:
        print('Usage: python %s <training data file> <test data file> <nb or knn> <optional training category file> <optional test category file>' % (
            argv[0]))
        sys.exit(-1)

    #store classifier type
    classifier = argv[3]

    if classifier != 'nb' and classifier != 'knn':
        print('Usage:  python %s <training data file> <test data file> <nb or knn> <optional training category file> <optional test category file>' % (
            argv[0]))
        sys.exit(-1)

    print('\nReading data files')

    #read the training and test sets
    dtrain = data.Data(argv[1])
    dtest = data.Data(argv[2])

    #get the categories and the training data train and the test data test
    if len(argv) > 5:
        traincatdata = data.Data(argv[4])
        testcatdata = data.Data(argv[5])

        traincats = traincatdata.get_data([traincatdata.get_headers()[0]])
        testcats = testcatdata.get_data([testcatdata.get_headers()[0]])

        train = dtrain.get_data(dtrain.get_headers())
        test = dtest.get_data(dtest.get_headers())

        headers = dtest.get_headers()
    else:
        #assume the categories are the last column
        traincats = dtrain.get_data([dtrain.get_headers()[-1]])
        testcats = dtest.get_data([dtest.get_headers()[-1]])

        train = dtrain.get_data(dtrain.get_headers()[:-1])
        test = dtest.get_data(dtest.get_headers()[:-1])

        headers = dtest.get_headers()[:-1]

    # The two classifier types differ only in construction and in the
    # arguments given to build(); everything after that is shared.
    if classifier == 'knn':
        k = read_line(
            'How many nearest neighbors? (default=3) Type number then press enter: '
        )
        k = 3 if k == '' else abs(int(k))

        model = classifiers.KNN()
        build_args = (train, traincats, k)
    else:  # classifier is nb
        model = classifiers.NaiveBayes()
        build_args = (train, traincats)

    print('\nTraining the classifier')
    # build the classifier from training set
    model.build(*build_args)

    print('\nClassifying training data')
    trainCat, _ = model.classify(train)

    print('\nBuilding training confusion matrix')
    print(model.confusion_matrix_str(
        model.confusion_matrix(traincats, trainCat)))

    print('\nClassifying testing data')
    testCat, _ = model.classify(test)

    print('\nBuilding testing confusion matrix')
    print(model.confusion_matrix_str(
        model.confusion_matrix(testcats, testCat)))

    #write test data set and categories to CSV file
    filename = read_line('Type filename for test data, then press enter: ')

    print('\nSaving test data')
    dtest.addColumn('Categories', 'numeric', testCat.T.tolist()[0])

    headers.append('Categories')

    dtest.write(filename, headers)
Example #16
0
    X_train, y_train, X_test, y_test)
print("The accuracy of Linear Regression is: {:.2f} %".format(
    accuracies_LinR.mean() * 100))
print("Standard Deviation of Linear Regression is {:.2f} %".format(
    accuracies_LinR.std() * 100))

# Logistic Regression: report mean and standard deviation of the accuracies.
cn_LR, accuracy_LR, accuracies_LR = classifiers.Logistic_Regression(
    X_train, y_train, X_test, y_test)
lr_mean_pct = accuracies_LR.mean() * 100
lr_std_pct = accuracies_LR.std() * 100
print("The accuracy of Logistic Regression is: {:.2f} %".format(lr_mean_pct))
print("Standard Deviation of Logistic Regression is {:.2f} %".format(
    lr_std_pct))

# KNN: report mean and standard deviation of the accuracies.
cn_KNN, accuracy_KNN, accuracies_KNN = classifiers.KNN(
    X_train, y_train, X_test, y_test)
knn_mean_pct = accuracies_KNN.mean() * 100
knn_std_pct = accuracies_KNN.std() * 100
print("The accuracy of KNN is: {:.2f} %".format(knn_mean_pct))
print("Standard Deviation of KNN is {:.2f} %".format(knn_std_pct))

# SVM: report mean and standard deviation of the accuracies.
cn_SVM, accuracy_SVM, accuracies_SVM = classifiers.SVM(
    X_train, y_train, X_test, y_test)
svm_mean_pct = accuracies_SVM.mean() * 100
svm_std_pct = accuracies_SVM.std() * 100
print("The accuracy of SVM is: {:.2f} %".format(svm_mean_pct))
print("Standard Deviation of SVM is {:.2f} %".format(svm_std_pct))

#Naive Bayes
cn_GNB, accuracy_GNB, accuracies_GNB = classifiers.Naive_Bayes(
    X_train, y_train, X_test, y_test)
print("The accuracy of Naive Bayes is: {:.2f} %".format(accuracies_GNB.mean() *
def main(argv):
    """Build a KNN or Naive-Bayes classifier and print its confusion matrices.

    argv layout, as read by the code below:
        argv[1]           classifier type, "KNN" or "Naive-Bayes"
        argv[2]           training data file
        argv[3]           test data file
        argv[4], argv[5]  optional training / test category files; they are
                          only used when both are given

    Without category files, the last column of each data file is taken as the
    category column and the remaining columns as the data.  After printing
    the training and test confusion matrices, the predicted test categories
    are passed to dtest.addColumn and dtest.write("writtendatafile.csv").
    """
    import sys

    # usage: argv[3] is read unconditionally below, so four entries are the
    # minimum (the previous "< 3" check let a missing test file through)
    if len(argv) < 4:
        print('Usage: python %s <classtype> <training data file> <test data file> <optional training category file> <optional test category file>' % (
            argv[0]))
        sys.exit(-1)

    # reject unknown classifier types before any file is read; previously
    # they fell through both branches and crashed on an undefined name
    classtype = argv[1]
    if classtype not in ("KNN", "Naive-Bayes"):
        print('Unknown classtype "%s": expected "KNN" or "Naive-Bayes"' %
              classtype)
        sys.exit(-1)

    # read the training and test sets
    dtrain = data.Data(argv[2])
    dtest = data.Data(argv[3])

    # get the categories and the training data A and the test data B
    if len(argv) > 5:
        traincatdata = data.Data(argv[4])
        testcatdata = data.Data(argv[5])
        traincats = traincatdata.get_data([traincatdata.get_headers()[0]])
        testcats = testcatdata.get_data([testcatdata.get_headers()[0]])
        A = dtrain.get_data(dtrain.get_headers())
        B = dtest.get_data(dtest.get_headers())
    else:
        # assume the categories are the last column
        traincats = dtrain.get_data([dtrain.get_headers()[-1]])
        testcats = dtest.get_data([dtest.get_headers()[-1]])
        A = dtrain.get_data(dtrain.get_headers()[:-1])
        B = dtest.get_data(dtest.get_headers()[:-1])

    # pick the classifier; the two variants differ only in the class used
    # and in the wording of the report titles
    if classtype == "KNN":
        print("You chose KNN")
        classifier = classifiers.KNN()
        train_title = "Training Confusion Matrix"
        test_title = "Testing Confusion Matrix"
    else:
        print("You chose Naive-Bayes")
        classifier = classifiers.NaiveBayes()
        train_title = "Training Data Confusion Matrix"
        test_title = "Test Data Confusion Matrix"

    # build the classifier from the training data, then classify both sets
    classifier.build(A, traincats)
    trainclasscats, _ = classifier.classify(A)
    testclasscats, _ = classifier.classify(B)

    # report the training results
    traincmtx = classifier.confusion_matrix(traincats, trainclasscats)
    traincmtxstr = classifier.confusion_matrix_str(traincmtx)
    print(train_title)
    print(traincmtxstr)

    # report the test results
    testcmtx = classifier.confusion_matrix(testcats, testclasscats)
    testcmtxstr = classifier.confusion_matrix_str(testcmtx)
    print(test_title)
    print(testcmtxstr)

    # attach the predicted categories to the test data and write it out
    dtest.addColumn("Classifiers", testclasscats)
    dtest.write("writtendatafile.csv")
Example #18
0
def main(argv):
    ''' Reads in a training set and a test set and builds a KNN classifier.
    Prints out confusion matrices for both sets and writes the test data,
    with the predicted category appended to every row, to "knncOut.csv".

    argv layout, as read by the code below:
        argv[1]           training data file
        argv[2]           test data file
        argv[3], argv[4]  optional training / test category files; they are
                          only used when both are given
    '''
    import sys

    # usage
    if len(argv) < 3:
        print('Usage: python %s <training data file> <test data file> <optional training category file> <optional test category file>' % (
            argv[0]))
        sys.exit(-1)

    # read the training and test sets
    dtrain = data.Data(argv[1])
    dtest = data.Data(argv[2])

    # get the categories and the training data A and the test data B
    if len(argv) > 4:
        traincatdata = data.Data(argv[3])
        testcatdata = data.Data(argv[4])
        traincats = traincatdata.get_data([traincatdata.get_headers()[0]])
        testcats = testcatdata.get_data([testcatdata.get_headers()[0]])
        A = dtrain.get_data(dtrain.get_headers())
        B = dtest.get_data(dtest.get_headers())
    else:
        # assume the categories are the last column
        traincats = dtrain.get_data([dtrain.get_headers()[-1]])
        testcats = dtest.get_data([dtest.get_headers()[-1]])
        A = dtrain.get_data(dtrain.get_headers()[:-1])
        B = dtest.get_data(dtest.get_headers()[:-1])

    # create KNN classifier
    knnc = classifiers.KNN()

    # build the classifier using the training data
    knnc.build(A, traincats)

    # use the classifier on the training data
    knnctraincats, knnctrainlabels = knnc.classify(A)
    print("For KNN (training data):")
    print(knnc.confusion_matrix_str(
        knnc.confusion_matrix(traincats, knnctraincats)))

    # use the classifier on the test data
    knnctestcats, knnctestlabels = knnc.classify(B)
    print("For KNN (test data):")
    print(knnc.confusion_matrix_str(
        knnc.confusion_matrix(testcats, knnctestcats)))

    # headers of the output file: the data columns plus the prediction.
    # list(...) copies, so appending never alters a list owned by dtest.
    if len(argv) > 4:
        knncHeaders = list(dtest.get_headers())
    else:
        knncHeaders = list(dtest.get_headers()[:-1])
    knncHeaders.append("Category")

    # write test data to csv; the with-block closes the file even when a
    # row fails to serialize
    with open("knncOut.csv", 'w') as knncfile:
        writeFile = csv.writer(knncfile)
        writeFile.writerow(knncHeaders)
        writeFile.writerow(["numeric"] * len(knncHeaders))
        for i in range(B.shape[0]):
            # assumes B is a 2-D matrix so that tolist() yields [[...]] --
            # TODO confirm against data.Data.get_data
            rowList = B[i, :].tolist()
            rowList[0].append(knnctestcats[i, 0])
            writeFile.writerow(rowList[0])

    return
Example #19
0
# Demo script exercising feature selection, model selection and data loading.
# NOTE(review): `s` and `d` are used below before anything in this snippet
# assigns them; presumably a classifier and a dataset created earlier --
# confirm against the full script.

# Combine `s` with featsel.RFE() through composite.FeatureSelect and run
# cv on `d` (second argument 3; presumably the number of folds -- confirm).
m = composite.FeatureSelect(s, featsel.RFE())
r = m.cv(d, 3)

# Same pipeline, but with a featsel.Filter built from the 'golub'
# FeatureScore (sigma=2) as the feature selector.
fs = featsel.FeatureScore('golub')
f = featsel.Filter(fs, sigma=2)
m = composite.FeatureSelect(s, f)
r = m.cv(d, 3)

# Model selection over the SVM parameter 'C' on 'heart.data'.
d = datafunc.SparseDataSet('heart.data')
p = modelSelection.Param(svm.SVM(), 'C', [0.1, 1, 10, 100, 1000])
m = modelSelection.ModelSelector(p)
m.train(d)

# Model selection over the KNN parameter 'k' on 'heartSparse.data'.
d = datafunc.SparseDataSet('heartSparse.data')
p = modelSelection.Param(classifiers.KNN(), 'k', [1, 2, 3, 5, 10, 15])
m = modelSelection.ModelSelector(p)
m.train(d)

# p.cv is called three separate times here; the later lines iterate over its
# result and read the successRate attribute of each item.
# NOTE(review): under Python 2 the list comprehensions rebind `r` to the
# last item they iterate over.
r = p.cv(d, numFolds=10)
results = [r for r in p.cv(d, numFolds=10)]
results = [r.successRate for r in p.cv(d, numFolds=10)]

# Loading data sets with the labels column given explicitly.
d = datafunc.SparseDataSet('yeast.data', labelsColumn=0)

d = datafunc.SparseDataSet('yeast2.data', labelsColumn=1)

# NOTE(review): this wildcard import appears after the names it presumably
# provides have already been used; it likely belongs at the top of the file.
from PyML import *

d = datafunc.VectorDataSet('yeast3.data', labelsColumn=1)
Example #20
0
def main(argv):
    '''Reads in a training set and a test set and builds two KNN
    classifiers: one built with the default arguments and one built with an
    extra argument of 10.  Both classify the training and the test data;
    confusion matrices are printed for the first classifier only.

    argv layout, as read by the code below:
        argv[1]           training data file
        argv[2]           test data file
        argv[3], argv[4]  optional training / test category files; they are
                          only used when both are given

    The file-reading part follows Bruce's KNN test code.
    '''
    import sys

    def as_int_categories(cats):
        """Return float categories as a list of ints; anything else as is."""
        if len(cats) > 0 and isinstance(cats[0], float):
            return [int(c) for c in cats]
        return cats

    # usage
    if len(argv) < 3:
        print('Usage: python %s <training data file> <test data file> <optional training category file> <optional test category file>' % (
            argv[0]))
        sys.exit(-1)

    # read the training and test sets
    dtrain = data.Data(argv[1])
    dtest = data.Data(argv[2])

    # get the categories and the training data A and the test data B
    if len(argv) > 4:
        traincatdata = data.Data(argv[3])
        testcatdata = data.Data(argv[4])
        traincats = traincatdata.get_data([traincatdata.get_headers()[0]])
        testcats = testcatdata.get_data([testcatdata.get_headers()[0]])
        A = dtrain.get_data(dtrain.get_headers())
        B = dtest.get_data(dtest.get_headers())
    else:
        # assume the categories are the last column
        traincats = dtrain.get_data([dtrain.get_headers()[-1]])
        testcats = dtest.get_data([dtest.get_headers()[-1]])
        A = dtrain.get_data(dtrain.get_headers()[:-1])  # train data matrix
        B = dtest.get_data(dtest.get_headers()[:-1])  # test data matrix

    # for float categories, turn them into ints.  The conversion used to
    # overwrite the categories with an empty list whenever the first entry
    # was not exactly a float; now non-float categories pass through intact.
    traincats = as_int_categories(traincats)
    testcats = as_int_categories(testcats)

    # create two classifiers
    knncall = classifiers.KNN()
    knnc10 = classifiers.KNN()

    # build the classifiers given data and categories
    knncall.build(A, traincats)
    # NOTE(review): the original comments call the 10 both "exemplars per
    # class" and "K"; confirm against classifiers.KNN.build
    knnc10.build(A, traincats, 10)

    # classify the training set; the knnc10 results are computed but not
    # reported anywhere below
    classcats, alllabels = knncall.classify(A)
    tencats, tenlabels = knnc10.classify(A)

    # build the training confusion matrix and print it out
    train_cmtx = knncall.confusion_matrix(traincats, classcats)
    cmtx = knncall.confusion_matrix_str(train_cmtx)
    # title + " " + text reproduces what the two-item print statement wrote
    print("   train set   confusion matrix \n " + " " + str(cmtx))

    # classify the test set
    classcats, alllabels = knncall.classify(B)
    tencats, tenlabels = knnc10.classify(B)

    # build the test confusion matrix and print it out
    test_cmtx = knncall.confusion_matrix(testcats, classcats)
    cmtx = knncall.confusion_matrix_str(test_cmtx)
    print("   test set   confusion matrix \n " + " " + str(cmtx))

    return
def main(argv):
    '''Build two KNN classifiers from a training set and compare their
    predictions on a test set.

    One classifier is built with the default arguments, the other with an
    extra build argument of 10.  For each of them every test row is listed
    with its true and estimated category; rows where the two differ are
    flagged with "**".

    argv holds the training data file, the test data file and, optionally,
    a training category file and a test category file.
    '''

    # not enough arguments: show the usage line and stop
    if len(argv) < 3:
        usage = 'Usage: python %s <training data file> <test data file> <optional training category file> <optional test category file>'
        print(usage % (argv[0]))
        exit(-1)

    # load both data sets
    dtrain = data.Data(argv[1])
    dtest = data.Data(argv[2])

    # work out the categories plus the training matrix A and test matrix B
    separate_category_files = len(argv) > 4
    if separate_category_files:
        traincatdata = data.Data(argv[3])
        testcatdata = data.Data(argv[4])
        first_train_header = traincatdata.get_headers()[0]
        traincats = traincatdata.get_data([first_train_header])
        first_test_header = testcatdata.get_headers()[0]
        testcats = testcatdata.get_data([first_test_header])
        A = dtrain.get_data(dtrain.get_headers())
        B = dtest.get_data(dtest.get_headers())
    else:
        # the last column of each file holds the categories
        traincats = dtrain.get_data([dtrain.get_headers()[-1]])
        testcats = dtest.get_data([dtest.get_headers()[-1]])
        A = dtrain.get_data(dtrain.get_headers()[:-1])
        B = dtest.get_data(dtest.get_headers()[:-1])

    def report(title, cats, labels):
        # one line per test row: index, true category, estimated category
        print(title)
        print('     True  Est')
        for row in range(cats.shape[0]):
            truth = int(testcats[row, 0])
            estimate = int(labels[row, 0])
            flag = "" if truth == estimate else " **"
            print("%03d: %4d %4d" % (row, truth, estimate) + flag)

    # the two classifiers share the training data and differ only in the
    # extra build argument
    knncall = classifiers.KNN()
    knnc10 = classifiers.KNN()
    knncall.build(A, traincats)
    knnc10.build(A, traincats, 10)

    # run both on the test data before printing anything
    allcats, alllabels = knncall.classify(B)
    tencats, tenlabels = knnc10.classify(B)

    report('Results using All Exemplars:', allcats, alllabels)

    print(knnc10)

    report('Results using 10 Exemplars:', tencats, tenlabels)

    return
def test_classifier(classifier, X, y):
    """Evaluate one classifier over five random train/test partitions.

    Each round splits the data with ``partition(X, y, 0.7)``, classifies
    every test sample individually and prints the overall and per-class hit
    counts and hit rates.  After the last round the best and the worst round
    are reported together with their confusion matrices.

    Args:
        classifier: member of ``Classifier`` (KNN, MDC or QC) selecting which
            routine of ``classifiers`` makes the predictions.
        X: feature data handed to ``partition``.
        y: labels handed to ``partition``; the per-class counters treat
            label 0 as class "e" and label 1 as class "p".

    Raises:
        ValueError: if ``classifier`` is none of the supported members.
    """

    print("###### Testing " + classifier.value + " ######")

    # Resolve the prediction routine once instead of per sample.  An
    # unsupported value used to leave uninitialised np.empty() entries in
    # the predictions; fail loudly instead.
    if classifier is Classifier.KNN:
        predict = classifiers.KNN
    elif classifier is Classifier.MDC:
        predict = classifiers.MDC
    elif classifier is Classifier.QC:
        predict = classifiers.QC
    else:
        raise ValueError("Unsupported classifier: {!r}".format(classifier))

    def rate(hits, total):
        # a class can be missing from a random test split; report nan rather
        # than dividing by zero
        return hits / total if total else float("nan")

    worst_hit_rate = np.inf
    # -inf (not 0) so that a round scoring exactly 0 is still recorded and
    # the confusion matrix below never sees the placeholder arrays
    best_hit_rate = -np.inf
    best_y_pred = np.array(0)
    worst_y_pred = np.array(0)
    best_y_test = np.array(0)
    worst_y_test = np.array(0)

    for j in range(5):

        print("*** Round {}:".format(j + 1))
        X_train, X_test, y_train, y_test = partition(X, y, 0.7)
        n_e = list(y_test).count(0)
        n_p = list(y_test).count(1)
        y_pred = np.empty([len(y_test)])

        true_positives_p = 0  # correct predictions of class "p"
        true_positives_e = 0  # correct predictions of class "e"
        true_positives = 0  # correct predictions overall

        for i in range(len(X_test)):
            y_pred[i] = predict(X_test[i], X_train, y_train)

            if y_pred[i] == y_test[i]:
                true_positives += 1
                if y_pred[i] == 0:
                    true_positives_e += 1
                else:
                    true_positives_p += 1

        print("True positives: {} from {} samples".format(
            true_positives, len(y_test)))
        print("True positives e: {} from {} samples".format(
            true_positives_e, n_e))
        print("True positives p: {} from {} samples".format(
            true_positives_p, n_p))

        hit_rate = rate(true_positives, len(y_test))
        hit_rate_e = rate(true_positives_e, n_e)
        hit_rate_p = rate(true_positives_p, n_p)

        print("Hit rate: {}".format(hit_rate))
        print("Hit rate e: {}".format(hit_rate_e))
        print("Hit rate p: {}".format(hit_rate_p))

        # strict comparisons: on ties the earliest round is kept
        if hit_rate > best_hit_rate:
            best_y_pred = y_pred
            best_y_test = y_test
            best_hit_rate = hit_rate

        if hit_rate < worst_hit_rate:
            worst_y_pred = y_pred
            worst_y_test = y_test
            worst_hit_rate = hit_rate

    print("Best result: {}".format(best_hit_rate))
    print("Confusion matrix best result: ")
    cnf_matrix_best = confusion_matrix(best_y_test, best_y_pred)
    print(cnf_matrix_best)

    print("Worst result: {}".format(worst_hit_rate))
    print("Confusion matrix worst result: ")
    cnf_matrix_worst = confusion_matrix(worst_y_test, worst_y_pred)
    print(cnf_matrix_worst)
Example #23
0
def main(argv):
    """Classify samples of the 25 most frequent labels with KNN or a neural net.

    argv layout, as read by the code below:
        argv[1]  CSV file of samples; the first column is dropped and the
                 remaining columns are used as float32 features
        argv[2]  metadata file handed to ``readlabels``
        argv[3]  classifier selector: 0 for KNN, 1 for ANN

    Only samples whose label is among the 25 most frequent labels are kept.
    They are split at random, each row going to the training set with
    probability 0.8 and to the test set otherwise.  The KNN branch prints
    accuracies and confusion matrices for both splits and saves the matrices
    as images under "../results/".  The ANN branch prints the accuracy
    reported by the network on the test split.
    """

    def split_train_test(samples, labels, test_fraction=0.2):
        """Randomly divide the rows into a training and a test set.

        A row goes to the training set when a uniform random draw exceeds
        ``test_fraction``.  Returns (train_samples, train_labels,
        test_samples, test_labels); the labels are column matrices.
        """
        train_rows, train_labels = [], []
        test_rows, test_labels = [], []
        for i in range(len(labels)):
            if np.random.random() > test_fraction:
                train_labels.append(labels[i])
                train_rows.append(samples[i, :])
            else:
                test_labels.append(labels[i])
                test_rows.append(samples[i, :])
        # NOTE: np.vstack raises ValueError if either split ends up empty
        train_samples = np.vstack(train_rows)
        train_labels = np.matrix(train_labels).T
        test_samples = np.vstack(test_rows)
        test_labels = np.matrix(test_labels).T
        return train_samples, train_labels, test_samples, test_labels

    # usage
    if len(argv) < 4:
        print("Usage: python3 %s <data.csv> <metadata.csv> <0 - KNN; 1 -ANN>" %
              argv[0])
        sys.exit()

    datafilename = argv[1]
    metadatafilename = argv[2]
    classifierType = int(argv[3])

    # read data; the first column is not used as a feature
    datamat = np.genfromtxt(datafilename, delimiter=',')
    numdata = datamat.shape[0]
    features = datamat[:, 1:].astype(np.float32)

    # read labels; readlabels returns the label matrix and a dictionary whose
    # inverse maps the numeric label values back to the keys printed below
    labelsmat, label_ids = readlabels(metadatafilename)
    inv_dict = {v: k for k, v in label_ids.items()}

    # keep only as many label rows as there are data rows
    labelsmat = labelsmat[:numdata, :]

    unique, counts = np.unique(labelsmat, return_counts=True, axis=0)

    #############################################

    # get top 25 labels: order the unique labels by descending count and
    # keep the label values of the first 25
    top25rows = np.argsort(counts)[::-1].tolist()[:25]
    top25labels = unique[top25rows, :].T.tolist()[0]

    data_top25 = []
    labels_top25 = []

    for i in range(features.shape[0]):
        if labelsmat[i, 0] in top25labels:
            labels_top25.append(labelsmat[i, 0])
            data_top25.append(features[i, :])

    data_top25 = np.matrix(data_top25)
    labels_top25 = np.matrix(labels_top25).T

    # inverse_top25[i] is the index of row i's label within unique_top25;
    # these indices are what the classifiers are given as categories
    unique_top25, inverse_top25, counts_top25 = np.unique(labels_top25,
                                                          return_counts=True,
                                                          return_inverse=True,
                                                          axis=0)

    print("Top 25 Labels:")
    for i in range(unique_top25.shape[0]):
        print(i, " : ", inv_dict[unique_top25[i, 0]], ", ", counts_top25[i])

    ##################################### ANN #############
    if classifierType == 1:
        print("******************ANN classifier:**************")

        print(data_top25.shape[0])

        (data_top25_train, labels_top25_train, data_top25_test,
         labels_top25_test) = split_train_test(data_top25, inverse_top25)

        # NOTE(review): the network is constructed from the TEST split and
        # then evaluated on that same split; the training split is unused.
        # Earlier commented-out code built it from the training split and
        # called nnc.train() -- confirm which behaviour is intended.
        nnc = classifiers.NeuralNet(data_top25_test, labels_top25_test)
        print("NN testing data")
        test_new_cats = nnc.classify(data_top25_test)
        print(labels_top25_test.shape)
        print("NN fisnished prediction")
        print(nnc.accuracy(labels_top25_test, test_new_cats))

    ##############################
    elif classifierType == 0:
        print("******************KNN classifier:**************")

        # split training testing
        print(data_top25.shape[0])

        (data_top25_train, labels_top25_train, data_top25_test,
         labels_top25_test) = split_train_test(data_top25, inverse_top25)

        # CLASSIFY
        K = 7
        print('Building KNN Classifier (K=%d)' % K)
        knnc = classifiers.KNN(data_top25_train, labels_top25_train, K)

        print('KNN Training Set Results')

        newcats, newlabels = knnc.classify(data_top25_train)

        accuracy = knnc.accuracy(labels_top25_train, newlabels)
        print("Training accuracy", accuracy)

        confmtx = knnc.confusion_matrix(labels_top25_train, newlabels)

        plt.matshow(confmtx)
        plt.title("Training: %d data; %.4f accruacy." %
                  (labels_top25_train.shape[0], accuracy))
        plt.savefig("../results/training.png", dpi=300)

        print(knnc.confusion_matrix_str(confmtx))

        print('KNN Test Set Results')

        newcats, newlabels = knnc.classify(data_top25_test)

        accuracy = knnc.accuracy(labels_top25_test, newlabels)

        print("Testing accuracy", accuracy)

        # print the confusion matrix
        confmtx = knnc.confusion_matrix(labels_top25_test, newlabels)

        plt.matshow(confmtx)
        plt.title("Testing: %d data; %.4f accruacy." %
                  (labels_top25_test.shape[0], accuracy))
        plt.savefig("../results/testing.png", dpi=300)

        print(knnc.confusion_matrix_str(confmtx))

    else:
        print("invalid argv[-1]")
        print(
            "Usage: python3 %s <data.csv> <metadata.csv> <0 - KNN; 1 - ANN>" %
            argv[0])
        sys.exit()
Example #24
0
def classify(trainingSet,
             testSet,
             bayes=True,
             optrainingCats=None,
             optestCats=None,
             outputFile="KNN.csv"):
    """Train a Naive Bayes or KNN classifier, report on it and save predictions.

    Args:
        trainingSet: passed to ``data.Data`` to load the training data.
        testSet: passed to ``data.Data`` to load the test data.
        bayes: use ``classifiers.NaiveBayes`` when true, otherwise
            ``classifiers.KNN`` built with a final argument of 5.
        optrainingCats: optional source of training categories, passed to
            ``data.Data``; when None the last column of the training data
            holds the categories.
        optestCats: optional source of test categories, handled the same way
            for the test data.
        outputFile: path of the CSV file that receives the test rows with a
            "predicted categories" column appended.

    Confusion matrices for the training and the test set are printed along
    the way.
    """
    print("in classify")
    dtrain = data.Data(trainingSet)
    dtest = data.Data(testSet)

    if optrainingCats is not None:
        trainHeaders = dtrain.get_headers()
        trainCats = data.Data(optrainingCats)
        trainCatsData = trainCats.newMatrix(trainCats.get_headers())
    else:
        trainHeaders = dtrain.get_headers()[:-1]
        trainCatsData = dtrain.newMatrix([dtrain.get_headers()[-1]])

    # NOTE(review): the test columns are selected with the TRAINING set's
    # headers in both branches; presumably so that both matrices use the same
    # features -- confirm that dtest.get_headers() was not intended
    if optestCats is not None:
        testHeaders = dtrain.get_headers()
        testCats = data.Data(optestCats)
        testCatsData = testCats.newMatrix(testCats.get_headers())
    else:
        testHeaders = dtrain.get_headers()[:-1]
        testCatsData = dtest.newMatrix([dtest.get_headers()[-1]])

    # the two classifier kinds share every later step and differ only in how
    # they are constructed and in the name used in the printed titles
    if bayes:
        name = 'Naive Bayes'
        classifier = classifiers.NaiveBayes(dtrain, trainHeaders,
                                            trainCatsData)
    else:
        name = 'KNN'
        print('Building KNN Classifier')
        classifier = classifiers.KNN(dtrain, trainHeaders, trainCatsData, 5)

    print('%s Training Set Results' % name)
    trainMatrix = dtrain.newMatrix(trainHeaders)
    newcats, newlabels = classifier.classify(trainMatrix)

    # map the true categories onto indices 0..n-1 of their sorted unique
    # values before comparing them with the classifier's output
    uniquelabels, correctedtraincats = np.unique(
        trainCatsData.T.tolist()[0], return_inverse=True)
    correctedtraincats = np.matrix([correctedtraincats]).T

    confmtx = classifier.confusion_matrix(correctedtraincats, newcats)
    print(classifier.confusion_matrix_str(confmtx))

    print('%s Test Set Results' % name)
    testMatrix = dtest.newMatrix(testHeaders)
    newcats, newlabels = classifier.classify(testMatrix)

    uniquelabels, correctedtestcats = np.unique(testCatsData.T.tolist()[0],
                                                return_inverse=True)
    correctedtestcats = np.matrix([correctedtestcats]).T

    # print the confusion matrix
    confmtx = classifier.confusion_matrix(correctedtestcats, newcats)
    print(classifier.confusion_matrix_str(confmtx))

    # Build the output header row as a new list: appending in place could
    # alter the header list owned by dtrain (and shared with trainHeaders).
    outputHeaders = list(testHeaders) + ["predicted categories"]

    # NOTE(review): on Python 3 the csv module recommends opening the file
    # with newline=''; not added here because that keyword is unavailable to
    # the Python 2 built-in open().
    with open(outputFile, mode='w') as outFile:
        writer = csv.writer(outFile)
        writer.writerow(outputHeaders)
        writer.writerow(["numeric"] * len(outputHeaders))
        for i, row in enumerate(testMatrix.tolist()):
            row.append(newcats[i, 0])
            writer.writerow(row)