예제 #1
0
def get_test_label_list(train_file_route, test_file_route, algorithm,
                        classifier):
    """Weight the train/test files, classify the test set, return labels.

    Args:
        train_file_route: Passed to the selected ``algo`` weighting function
            to obtain the training features and training labels.
        test_file_route: Passed to the same weighting function to obtain the
            test features and the true test labels.
        algorithm: ``'tf_idf'`` selects ``algo.tf_idf`` and ``'tf_dc'``
            selects ``algo.tf_dc``; any other value falls back to
            ``algo.tf_bdc``.
        classifier: ``'KNN'`` selects ``clf.KNN``; any other value falls
            back to ``clf.SVM``.

    Returns:
        A ``(test_label_list, predict_test_label_list)`` tuple: the labels
        the weighting function returned for the test file, and whatever the
        selected classifier returned for the test features.
    """
    # This must be a single if/elif/else chain.  With two independent `if`
    # statements a 'tf_idf' request also fell into the trailing `else`, so
    # its features were silently replaced by the tf_bdc ones.
    if algorithm == 'tf_idf':
        weigh = algo.tf_idf
    elif algorithm == 'tf_dc':
        weigh = algo.tf_dc
    else:
        weigh = algo.tf_bdc

    train_feature_sparse_matrix, train_label_list = weigh(train_file_route)
    test_feature_sparse_matrix, test_label_list = weigh(test_file_route)

    predict = clf.KNN if classifier == 'KNN' else clf.SVM
    predict_test_label_list = predict(train_feature_sparse_matrix,
                                      train_label_list,
                                      test_feature_sparse_matrix)
    return test_label_list, predict_test_label_list
예제 #2
0
def main(argv):
    """Build two KNN classifiers from a data file and print both.

    The first classifier is built from all exemplars; the second passes
    ``10`` as the third argument to ``build`` (10 exemplars per class).

    Args:
        argv: Command-line style argument list.  ``argv[1]`` is the data
            file.  When ``argv[2]`` is present it is read as a separate
            category file whose first header supplies the categories and
            every column of the data file is used as data; otherwise the
            last column of the data file supplies the categories and the
            remaining columns are the data.

    Raises:
        SystemExit: With status ``-1``, after printing a usage line, when
            no data file is given.
    """
    # Local import keeps this block self-contained; sys.exit is used instead
    # of the `exit` helper, which only exists when the `site` module loaded.
    import sys

    if len(argv) < 2:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Usage: python %s <data file> <optional category file>'
              % argv[0])
        sys.exit(-1)

    # Read the data.
    d = data.Data(argv[1])

    # Get the categories and the data matrix.
    if len(argv) > 2:
        catdata = data.Data(argv[2])
        cats = catdata.get_data([catdata.get_headers()[0]])
        A = d.get_data(d.get_headers())
    else:
        # No category file: assume the categories are the last column.
        cats = d.get_data([d.get_headers()[-1]])
        A = d.get_data(d.get_headers()[:-1])

    # Classifier built from all exemplars; printing relies on its __str__.
    knnc = classifier.KNN()
    knnc.build(A, cats)
    print(knnc)

    # Classifier built with 10 exemplars per class.
    knnc2 = classifier.KNN()
    knnc2.build(A, cats, 10)
    print(knnc2)

    return
def getRating(tweet):
    """Return the rating a k=8 KNN classifier predicts for ``tweet``.

    The tweet is passed through ``preProcess`` and ``getFeatureVector``; a
    new ``classifier.KNN(k=8)`` is then trained on the vectors and labels
    returned by ``loadTrainSetFV`` and asked to predict the tweet's vector.
    The classifier is rebuilt and retrained on every call.
    """
    # Raw tweet -> pre-processed form -> feature vector.
    features = getFeatureVector(preProcess(tweet))

    # Feature vectors and labels of the already-labelled training set.
    train_vectors, train_labels = loadTrainSetFV()

    # Fit a fresh classifier, then classify this one instance.
    model = classifier.KNN(k=8)
    model.train(train_vectors, train_labels)
    return model.predict(features)
예제 #4
0
def main(argv):
    """Load a training set and a test set and create a KNN classifier.

    NOTE(review): as written, this function stops after creating the
    classifier; the loaded matrices and categories are not used yet.  The
    snippet appears to be cut short -- confirm against the full source.

    Args:
        argv: Command-line style argument list.  ``argv[1]`` is the
            training data file and ``argv[2]`` the test data file.  Only
            when both ``argv[3]`` and ``argv[4]`` are present are they read
            as the training and test category files (first header of each);
            otherwise the last column of each data file supplies the
            categories and the remaining columns are the data.

    Raises:
        SystemExit: With status ``-1``, after printing a usage line, when
            fewer than two file arguments are given.
    """
    # Local import keeps this block self-contained; sys.exit is used instead
    # of the `exit` helper, which only exists when the `site` module loaded.
    import sys

    if len(argv) < 3:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Usage: python %s <training data file> <test data file> '
              '<optional training category file> '
              '<optional test category file>' % argv[0])
        sys.exit(-1)

    # Read in the training set and the test set.
    data_train = data.Data(argv[1])
    data_test = data.Data(argv[2])

    # Separate category files are used only when both were supplied.
    if len(argv) > 4:
        # Categories of the training data.
        train_cat_data = data.Data(argv[3])
        train_cats = train_cat_data.get_data(
            [train_cat_data.get_headers()[0]])
        # Categories of the test data.
        test_cat_data = data.Data(argv[4])
        test_cats = test_cat_data.get_data([test_cat_data.get_headers()[0]])
        # Training data A and test data B use every column.
        A = data_train.get_data(data_train.get_headers())
        B = data_test.get_data(data_test.get_headers())
    else:
        # Assume the categories are the last column of each data file.
        train_cats = data_train.get_data([data_train.get_headers()[-1]])
        test_cats = data_test.get_data([data_test.get_headers()[-1]])
        A = data_train.get_data(data_train.get_headers()[:-1])
        B = data_test.get_data(data_test.get_headers()[:-1])

    # Create the classifier.  (This line was tab-indented in the original,
    # which does not parse next to the space-indented lines above.)
    knnClass = classifier.KNN()
    print("Created Classifier, Building Now.")
예제 #5
0
def main(argv):
    """Build a KNN classifier, report confusion matrices, write a CSV.

    The classifier is built from the training data, used to classify both
    the training set and the test set, and a confusion matrix is printed
    for each.  The categories returned for the test set are then added to
    the test data as a "KNN Classification" column and written to
    ``knnClass.csv``.

    Args:
        argv: Command-line style argument list.  ``argv[1]`` is the
            training data file and ``argv[2]`` the test data file.  Only
            when both ``argv[3]`` and ``argv[4]`` are present are they read
            as the training and test category files (first header of each);
            otherwise the last column of each data file supplies the
            categories and the remaining columns are the data.

    Raises:
        SystemExit: With status ``-1``, after printing a usage line, when
            fewer than two file arguments are given.
    """
    # Local import keeps this block self-contained; sys.exit is used instead
    # of the `exit` helper, which only exists when the `site` module loaded.
    import sys

    if len(argv) < 3:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Usage: python %s <training data file> <test data file> '
              '<optional training category file> '
              '<optional test category file>' % argv[0])
        sys.exit(-1)

    # Read in the training set and the test set.
    data_train = data.Data(argv[1])
    data_test = data.Data(argv[2])

    # Separate category files are used only when both were supplied.
    if len(argv) > 4:
        # Categories of the training data.
        train_cat_data = data.Data(argv[3])
        train_cats = train_cat_data.get_data([train_cat_data.get_headers()[0]])
        # Categories of the test data.
        test_cat_data = data.Data(argv[4])
        test_cats = test_cat_data.get_data([test_cat_data.get_headers()[0]])
        # Training data A and test data B use every column.
        A = data_train.get_data(data_train.get_headers())
        B = data_test.get_data(data_test.get_headers())
    else:
        # Assume the categories are the last column of each data file.
        train_cats = data_train.get_data([data_train.get_headers()[-1]])
        test_cats = data_test.get_data([data_test.get_headers()[-1]])
        A = data_train.get_data(data_train.get_headers()[:-1])
        B = data_test.get_data(data_test.get_headers()[:-1])

    # Create and build the classifier.
    knnClass = classifier.KNN()
    print("Created Classifier, Building Now.")
    knnClass.build(A, train_cats)
    print("Built! Now classifying.")

    # Classify the training set and print its confusion matrix.
    _, alabels = knnClass.classify(A)
    print("Done Classifying.")
    train_matrix_text = _confusion_matrix_text(knnClass, train_cats, alabels)
    print("Training Confusion Matrix:")
    print(train_matrix_text)

    # Classify the test set and print its confusion matrix.
    bcats, blabels = knnClass.classify(B)
    print("Done Classifying.")
    test_matrix_text = _confusion_matrix_text(knnClass, test_cats, blabels)
    print("Test Confusion Matrix:")
    print(test_matrix_text)

    # Write the test data out again with the categories as an extra column.
    data_test.addColumn("KNN Classification", bcats)
    data_test.toFile(filename="knnClass.csv")

    return


def _confusion_matrix_text(knn, true_cats, est_labels):
    """Return knn's confusion-matrix string for true vs. estimated labels."""
    # np.unique(..., return_inverse=True)[1] replaces each value with its
    # index in the sorted array of unique values.
    # NOTE(review): the two inputs are coded independently, so the codes
    # only line up when both contain the same set of values -- confirm.
    true_codes = np.unique(np.array(true_cats.T), return_inverse=True)[1]
    est_codes = np.unique(np.array(est_labels.T), return_inverse=True)[1]
    mtx = knn.confusion_matrix(np.matrix(true_codes).T,
                               np.matrix(est_codes).T)
    return knn.confusion_matrix_str(mtx)
    Xtrain, Ytrain)
# Score a decision tree against Xtest/Ytest.  `max_depth`, `val_err` and
# `train_err` are bound by a statement that starts above this excerpt
# (presumably a cross-validation call -- confirm against the full script).
test_err = classifier.decision_tree(Xtrain,
                                    Ytrain,
                                    Xtest,
                                    Ytest,
                                    depth=max_depth)
# Report the decision-tree figures (Python 2 print statements).
print "Decision Tree Classifier\n"
print "Validation Error = ", val_err
print "Training Error = ", train_err
print "Testing Error = ", test_err
print "Optimal Depth = ", max_depth
print "\n"
# The bare string below is a no-op expression statement used as a section
# heading; it is not a docstring.
''' K Nearest Neighbor '''
# The cross-validation helper returns a validation error, a training error
# and the K that is then passed to classifier.KNN below.
val_err, train_err, opt_K = classifier.K_Fold_crossValidation_KNN(
    Xtrain, Ytrain)
test_err = classifier.KNN(Xtrain, Ytrain, Xtest, Ytest, K=opt_K)
# Report the KNN figures.
print "K Nearest Neighbor Classifier\n"
print "Validation Error = ", val_err
print "Training Error = ", train_err
print "Testing Error = ", test_err
print "Optimal K = ", opt_K
print "\n"
# Section heading for the linear-kernel SVM (again a no-op string).
''' SVM - linear kernel '''
# Same pattern with ker='linear'; the third result is named opt_C here.
val_err, train_err, opt_C = classifier.K_Fold_crossValidation_SVM(Xtrain,
                                                                  Ytrain,
                                                                  ker='linear')
test_err = classifier.SVM(Xtrain,
                          Ytrain,
                          Xtest,
                          Ytest,
                          ker='linear',
예제 #7
0
__author__ = "Harshilkumar Patel"
__status__ = "Development"

import config
import classifier
from utils import logger
import constants

# Training data handed to whichever classifier gets constructed below.
data = config.get_training_data()

# "knn" selects classifier.KNN; every other configured choice selects
# classifier.NaiveBayes.
Classifier = (
    classifier.KNN
    if constants.CLASSIFIER_CHOICE == "knn"
    else classifier.NaiveBayes
)(data)

logger.debug("formatted data is %s", Classifier.data)

# Predict on element [1] of what config returns for "input.txt".
result = Classifier.predict(
    config.get_training_data("input.txt")[1]
)
logger.debug("THE FINAL PREDICTION is %s", result)

# f = open('output.txt', 'w')
# f.write(result)
# f.close()
예제 #8
0
def main(argv):
    """Build two KNN classifiers and print their results on the test set.

    One classifier is built from all exemplars and one with 10 exemplars
    per class.  Both classify the test data; for each, a table of true
    versus estimated category is printed, with mismatches flagged by
    ``**``.  The 10-exemplar classifier itself is printed between the two
    tables.

    Args:
        argv: Command-line style argument list.  ``argv[1]`` is the
            training data file and ``argv[2]`` the test data file.  Only
            when both ``argv[3]`` and ``argv[4]`` are present are they read
            as the training and test category files (first header of each);
            otherwise the last column of each data file supplies the
            categories and the remaining columns are the data.

    Raises:
        SystemExit: With status ``-1``, after printing a usage line, when
            fewer than two file arguments are given.
    """
    # Local import keeps this block self-contained; sys.exit is used instead
    # of the `exit` helper, which only exists when the `site` module loaded.
    import sys

    if len(argv) < 3:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Usage: python %s <training data file> <test data file> '
              '<optional training category file> '
              '<optional test category file>' % argv[0])
        sys.exit(-1)

    # Read the training and test sets.
    dtrain = data.Data(argv[1])
    dtest = data.Data(argv[2])

    # Get the categories, the training data A and the test data B.
    if len(argv) > 4:
        traincatdata = data.Data(argv[3])
        testcatdata = data.Data(argv[4])
        traincats = traincatdata.get_data([traincatdata.get_headers()[0]])
        testcats = testcatdata.get_data([testcatdata.get_headers()[0]])
        A = dtrain.get_data(dtrain.get_headers())
        B = dtest.get_data(dtest.get_headers())
    else:
        # Assume the categories are the last column of each data file.
        traincats = dtrain.get_data([dtrain.get_headers()[-1]])
        testcats = dtest.get_data([dtest.get_headers()[-1]])
        A = dtrain.get_data(dtrain.get_headers()[:-1])
        B = dtest.get_data(dtest.get_headers()[:-1])

    # Two classifiers: all exemplars, and 10 exemplars per class.
    knncall = classifier.KNN()
    knnc10 = classifier.KNN()
    knncall.build(A, traincats)
    knnc10.build(A, traincats, 10)

    # classify() returns (categories, labels); only the categories are
    # compared against the true test categories below.
    allcats, _ = knncall.classify(B)
    tencats, _ = knnc10.classify(B)

    print('Results using All Exemplars:')
    _print_true_vs_estimated(testcats, allcats)

    print(knnc10)

    print('Results using 10 Exemplars:')
    _print_true_vs_estimated(testcats, tencats)

    return


def _print_true_vs_estimated(true_cats, est_cats):
    """Print one 'index: true est' row per estimate, flagging misses."""
    print('     True  Est')
    for i in range(est_cats.shape[0]):
        true_cat = int(true_cats[i, 0])
        est_cat = int(est_cats[i, 0])
        # Rows where the estimate differs from the truth get a ' **' suffix.
        flag = '' if true_cat == est_cat else ' **'
        print("%03d: %4d %4d%s" % (i, true_cat, est_cat, flag))