Beispiel #1
0
def test(training_file, test_file, method="rf"):
    """Train a classifier on ``training_file`` and print metrics for ``test_file``.

    ``method`` selects the model: ``"gb"`` builds ``GBClassifier``, ``"et"``
    builds ``ETClassifier`` and any other value (default ``"rf"``) builds
    ``RFClassifier(training_file, 100)``.  ``"svm"`` is special: the whole
    run is delegated to ``svm.test`` and the process then exits with status
    0, so nothing after that point runs.

    ``test_file`` is read as tab-separated numeric data.  The last column is
    taken as the expected output, every preceding column as a feature.
    All results are printed to stdout; nothing is returned.
    """
    time_format = "%Y-%m-%d %H:%M:%S"

    print("testing...")
    sys.stdout.write("%s:training... " % strftime(time_format, gmtime()))
    sys.stdout.flush()

    if method == 'svm':
        # The svm module does its own training and evaluation.  Exiting the
        # process here is the long-standing behaviour and is kept as is.
        svm.test(training_file, test_file)
        print("(%s) DONE." % strftime(time_format, gmtime()))
        sys.exit(0)

    if method == "gb":
        classifier = GBClassifier(training_file)
    elif method == "et":
        classifier = ETClassifier(training_file)
    else:
        classifier = RFClassifier(training_file, 100)
    print("(%s) DONE." % strftime(time_format, gmtime()))

    my_data = genfromtxt(test_file, delimiter='\t', skip_header=0)
    # The last column holds the expected outputs; the rest are features.
    n_features = my_data.shape[1] - 1
    X = my_data[:, :n_features]
    Y = np.squeeze(my_data[:, n_features:])

    predictions = classifier.predict(X)

    # Compute classification accuracy.
    if np.unique(Y).size == 2:
        # ROC / AUC only make sense for binary classification.
        fpr, tpr, thresholds = metrics.roc_curve(Y, predictions)
        print("auc/roc report: ")
        print("%s %s %s %s" % (fpr, tpr, metrics.auc(fpr, tpr), thresholds))
    else:
        print('nDCG')
        head_list_limit = None
        print(compute_nDCG(Y, predictions, head_list_limit))
        print('nDCG 2')
        print(compute_nDCG_2(Y, predictions, head_list_limit))
        # Weighted precision for the multi-class case (result in 0..1).
        # ``labels`` and ``pos_label`` are passed by keyword: recent
        # scikit-learn releases reject them as positional arguments.
        weighted_precision = metrics.precision_score(
            Y, predictions, labels=None, pos_label=None, average='weighted')
        print("precision score: " + str(weighted_precision))

    # Both branches finish with the same two classification reports.
    print("full classification report: ")
    print(metrics.classification_report(Y, predictions))
    print("report for the rarest class: ")
    print(metrics.classification_report(Y, predictions, labels=[1]))
def evaluate_one_doc(clf_name, clf, phrases, features, true_keys, N=10):
    """Score the predicted key phrases of one document against the true ones.

    ``clf_name`` picks the backend: ``'NB'`` calls ``NB.test`` and ``'svm'``
    calls ``svm.test``, both as ``test(clf, N, features)``.  The result is
    used as a sequence of indices into ``phrases``.  Any other ``clf_name``
    produces no predictions.

    Returns:
        ``(precision, recall)`` as computed by ``get_precision`` and
        ``get_recall`` on ``true_keys`` and the predicted phrases.
    """
    if clf_name == 'NB':
        pred_idx = NB.test(clf, N, features)
    elif clf_name == 'svm':
        pred_idx = svm.test(clf, N, features)
    else:
        pred_idx = []

    # Map the predicted indices back to their phrases.
    pred_keys = [phrases[idx] for idx in pred_idx]

    # Report the count only after the list is built; it used to be printed
    # while the list was still empty and therefore always showed 0.
    print("# pred_keys %d" % len(pred_keys))
    print("--pred_keys:")
    print(pred_keys)
    print("--true keys:")
    print(true_keys)

    precision = get_precision(true_keys, pred_keys)
    recall = get_recall(true_keys, pred_keys)
    return precision, recall
# Train the SVM model for the descriptor selected on the command line.
# NOTE(review): ``gama`` is the keyword spelling this call passes to
# svm.train -- presumably the kernel gamma; confirm against svm.train.
SVM = svm.train(gama = 0.001,
                descriptor_name = arguments.descriptor,
                model_name = 'SVM')

# Training finished.
print('Done!\n')

# Announce the test phase on stdout ...
print('Testing Support Vector Machine Model\n')

# ... and record it, with the descriptor name, in globals.file
# (presumably an open report/log file -- confirm).
print('Testing %s SVM Model\n' % arguments.descriptor, file = globals.file)

# Run the trained model through svm.test and keep its predictions.
SVM_predict = svm.test(model = SVM,
                       descriptor_name = arguments.descriptor,
                       model_name = 'SVM')

# Testing finished.
print('Done!\n')

# Classification report heading on stdout.
print('Classification Report\n')

# Same heading in globals.file.
print('Classification Report\n', file = globals.file)

# SVM Model 
svm.classificationReport(model = SVM,
                         predict = SVM_predict,
                         descriptor_name = arguments.descriptor,
def bootstrapping(B, X, y, C):
    """Estimate SVM performance with ``B`` bootstrap rounds.

    In every round ``n`` row indices (``n`` = number of rows of ``X``) are
    drawn with replacement for training.  The model is trained with
    ``svm.train(X[rows], y[rows], C)`` and evaluated point by point with
    ``svm.test`` on the rows that were not drawn.  Predictions and labels
    are compared against ``1`` and ``-1``; any other value is not counted.

    Args:
        B: number of bootstrap rounds.
        X: 2-D array of samples, one row per sample.
        y: array of labels indexed like the rows of ``X``.
        C: value forwarded unchanged to ``svm.train``.

    Returns:
        ``(accuracy, error, recall, precision, specificity)``, each an
        array of length ``B``, with ``error = 1 - accuracy``.  A metric
        whose denominator is zero is reported as 0.0.
    """

    def ratio(numerator, denominator):
        # An empty denominator leaves the metric undefined; report 0.0.
        if denominator == 0:
            return 0.0
        return float(numerator) / float(denominator)

    accuracy = np.zeros(B)
    precision = np.zeros(B)
    recall = np.zeros(B)
    specificity = np.zeros(B)

    n = X.shape[0]
    for b in range(B):
        # Sample n indices with replacement; rows never drawn form the
        # test set of this round.
        train_samples = list(np.random.randint(0, n, n))
        test_samples = list(set(range(n)) - set(train_samples))

        # train the model
        theta = svm.train(X[train_samples], y[train_samples], C)

        testSet = X[test_samples]
        testLabels = y[test_samples]

        tp = tn = fp = fn = 0
        for j in range(testSet.shape[0]):
            test_label = testLabels[j]
            testResult = svm.test(theta, testSet[j, :].T)

            if testResult == 1 and test_label == 1:
                tp += 1
            elif testResult == 1 and test_label == -1:
                fp += 1
            elif testResult == -1 and test_label == 1:
                fn += 1
            elif testResult == -1 and test_label == -1:
                tn += 1

        accuracy[b] = ratio(tp + tn, tp + tn + fp + fn)
        recall[b] = ratio(tp, tp + fn)
        precision[b] = ratio(tp, tp + fp)
        specificity[b] = ratio(tn, tn + fp)

    # Computed once after the loop so it also exists when B == 0
    # (previously that case failed because ``error`` was never assigned).
    error = np.ones(B) - accuracy

    return accuracy, error, recall, precision, specificity
Beispiel #5
0
	test_x = [numpy.array([float(i) for i in x[2:]]) for x in temp_data[b:]]
	test_y = [1.0 if x[1] == 'M' else -1.0 for x in temp_data[b:]]
	return train_x, train_y, test_x, test_y

def main():
	"""Read ``wdbc.data`` from the current directory and return its lines.

	Returns:
		list of str: every line of the file, line endings included.
	"""
	# The context manager closes the file even if reading fails; the
	# handle used to be left open.
	with open('wdbc.data') as f:
		return f.readlines()

# Load the raw lines and split them into train and test parts
# (0.4 is the ratio handed to split()).
data = main()
train_x, train_y, test_x, test_y = split(data, 0.4)

# SVM: pick c, train, then measure precision and recall on the test part.
c = svm.get_c(train_x, train_y)
tsvm = svm.train(train_x, train_y, c)
psvm, rsvm = svm.test(test_x, test_y, tsvm)
# F1 is the harmonic mean 2pr / (p + r).  The denominator used to be
# ``psvm + psvm``, which reduced the whole expression to just the recall.
esvm = 2 * psvm * rsvm / (psvm + rsvm)

print("svm:")
print("\tF1 %.3f " %esvm)
print("\tprecision %.3f, recall %.3f" %(psvm, rsvm))

# Perceptron: same evaluation.
tp = perceptrone.train(train_x, train_y)
pp, rp = perceptrone.test(test_x, test_y, tp) 
ep = 2 * pp * rp / (pp + rp)

print("lp:")
print("\tF1 %.3f " %ep)
print("\tprecision %.3f, recall %.3f" %(pp, rp))

# c for the SMO-based SVM with a polynomial kernel.
c = svm_smo.get_c(train_x, train_y, kernels.poly)
Beispiel #6
0
# Load the raw lines and split them into train and test parts
# (0.4 is the ratio handed to split()).
data = main()
train_x, train_y, test_x, test_y = split(data, 0.4)

# nw model: pick c, train (two results), then measure precision/recall.
c = nw.get_c(train_x, train_y)
w1, w2 = nw.train(train_x, train_y, c)
pnw, rnw = nw.test(test_x, test_y, w1, w2)
enw = 2 * pnw * rnw / (pnw + rnw)

print("nw:")
print("\tF1 %.3f " % enw)
print("\tprecision %.3f, recall %.3f" % (pnw, rnw))

# SVM: same evaluation.
c = svm.get_c(train_x, train_y)
tsvm = svm.train(train_x, train_y, c)
psvm, rsvm = svm.test(test_x, test_y, tsvm)
# F1 is the harmonic mean 2pr / (p + r).  The denominator used to be
# ``psvm + psvm``, which reduced the whole expression to just the recall.
esvm = 2 * psvm * rsvm / (psvm + rsvm)

print("svm:")
print("\tF1 %.3f " % esvm)
print("\tprecision %.3f, recall %.3f" % (psvm, rsvm))

# Perceptron: same evaluation.
tp = perceptrone.train(train_x, train_y)
pp, rp = perceptrone.test(test_x, test_y, tp)
ep = 2 * pp * rp / (pp + rp)

print("lp:")
print("\tF1 %.3f " % ep)
print("\tprecision %.3f, recall %.3f" % (pp, rp))

# c for the SMO-based SVM with a polynomial kernel.
c = svm_smo.get_c(train_x, train_y, kernels.poly)
        # Build the training set by stacking the samples and labels of both
        # classes along the first axis.
        x = np.concatenate((train_data_1,train_data_2), axis=0)
        y = np.concatenate((train_label_1,train_label_2), axis=0)


        # Train the models: SVM with "rbf" and "linear" kernels, plus the
        # linear classifier (which returns a weight and a mean).
        svm_rbf = svm.train(x,y,"rbf")
        svm_linear = svm.train(x,y,"linear")
        w,mean = linear.train(x,y)

        # Build the test set the same way as the training set.
        test_data  = np.concatenate((test_data_1,test_data_2), axis=0)
        test_label = np.concatenate((test_label_1,test_label_2), axis=0)

        # Predict labels for the test set with each model.
        svm_rbf_label = svm.test(test_data,svm_rbf)
        linear_label = linear.test(test_data,w,mean)
        svm_linear_label = svm.test(test_data,svm_linear)

        # Compute the error of each model; the MLE and Parzen testers take
        # the per-class train/test data directly.
        svm_rbf_error = error_rate(test_label,svm_rbf_label)
        linear_error = error_rate(test_label,linear_label)
        svm_linear_error = error_rate(test_label,svm_linear_label)
        mle_error = testing_mle.testing(train_data_1,test_data_1,train_data_2,test_data_2)
        parzen_error = testing_parzen.testing(train_data_1,test_data_1,train_data_2,test_data_2,3)

        # Report all error figures.
        print "svm error(rbf):",svm_rbf_error
        print "svm error(linear):",svm_linear_error
        print "linaer_classifer error:",linear_error
        print "mle error:",mle_error
        print "parzen error:",parzen_error
	return numpy.array(features)

def kim_tfidf_ngrams(filename):
	"""Return the ``(uni_features, bi_features)`` pair."""
	# NOTE(review): ``filename`` is not used and neither returned name is
	# assigned inside this function, so both must resolve to variables
	# defined elsewhere -- confirm that this is intended.
	return uni_features, bi_features

def many_sentiment(filename):
	"""Return the sentiment counts that ``sentiment`` computes for ``filename``."""
	counts = sentiment.get_sentiment_counts(filename)
	return counts

if __name__ == "__main__":
	# Script entry point: train on the small training set, predict on the
	# small test set and print the classification metrics.
	train_file = '/home/ak/Courses/cs73/project/dataset/small_train.txt'
	test_file = '/home/ak/Courses/cs73/project/dataset/small_test.txt'

	# Feature blocks, in column order: kim_pos (5 features),
	# zhang_pos (7 features), many_sentiment (2 features).
	train_blocks = (
		kim_pos(train_file),
		zhang_pos(train_file),
		many_sentiment(train_file),
	)
	X_train = numpy.hstack(train_blocks)
	t_train = svm.compile_targets(train_file)

	model = svm.train(X_train, t_train)

	# Same feature blocks for the test file.
	test_blocks = (
		kim_pos(test_file),
		zhang_pos(test_file),
		many_sentiment(test_file),
	)
	X_test = numpy.hstack(test_blocks)
	t_test = svm.compile_targets(test_file)

	y_pred = svm.test(model, X_test)
	metrics.run_classification_metrics(t_test, y_pred)
Beispiel #9
0
def cross_validation(X, y, foldcount, C):
    """Evaluate the SVM with ``foldcount``-fold cross validation.

    The folds come from ``cross_validation_split(y, foldcount)``.  Fold
    ``j`` is used as the test set, all other folds are concatenated into
    the training set.  The model is trained with ``svm.train`` and
    evaluated point by point with ``svm.test``.  Predictions and labels are
    compared against ``1`` and ``-1``; any other value is not counted.

    Args:
        X: array of samples, indexable by a list of row indices.
        y: array of labels, indexed like ``X``.
        foldcount: number of folds.
        C: value forwarded unchanged to ``svm.train``.

    Returns:
        ``(accuracy, error, recall, precision, specificity)``, each an
        array of length ``foldcount``, with ``error = 1 - accuracy``.  A
        metric whose denominator is zero is reported as 0.0.
    """

    def ratio(numerator, denominator):
        # An empty denominator leaves the metric undefined; report 0.0.
        if denominator == 0:
            return 0.0
        return float(numerator) / float(denominator)

    accuracy = np.zeros(foldcount)
    precision = np.zeros(foldcount)
    recall = np.zeros(foldcount)
    specificity = np.zeros(foldcount)

    # extract k folds from the data
    split = cross_validation_split(y, foldcount)

    # running k fold x validation
    for j in range(foldcount):
        # fold j is the test set, every other fold goes into training
        testInd = split[j]
        trainInd = []
        for i in range(foldcount):
            if i != j:
                trainInd += split[i]

        trainSet = X[trainInd]
        trainLabels = y[trainInd]
        testSet = X[testInd]
        testLabels = y[testInd]

        # train the model
        theta = svm.train(trainSet, trainLabels, C)

        # confusion counts for this fold
        tp = tn = fp = fn = 0
        for i in range(len(testInd)):
            test_label = testLabels[i]
            testResult = svm.test(theta, testSet[i])

            if testResult == 1 and test_label == 1:
                tp += 1
            elif testResult == 1 and test_label == -1:
                fp += 1
            elif testResult == -1 and test_label == 1:
                fn += 1
            elif testResult == -1 and test_label == -1:
                tn += 1

        accuracy[j] = ratio(tp + tn, tp + tn + fp + fn)
        recall[j] = ratio(tp, tp + fn)
        precision[j] = ratio(tp, tp + fp)
        specificity[j] = ratio(tn, tn + fp)

    # Computed once after the loop so it also exists when foldcount == 0
    # (previously that case failed because ``error`` was never assigned).
    error = np.ones(foldcount) - accuracy

    return accuracy, error, recall, precision, specificity
def main():
    """Build features, then train and evaluate a classifier and a regressor.

    Feature sets are chosen by exact tokens in ``sys.argv`` and are stacked
    column-wise in this order:

    * ``k``  -- ``kim_features``, plus sentiment counts if not yet added
    * ``o``  -- ``omahony_features``, plus sentiment counts if not yet added
    * ``l``  -- ``liu_features``, plus sentiment counts if not yet added
    * ``z``  -- ``zhang_features`` (never adds sentiment counts)
    * ``t``  -- ``tfidf_ngrams`` without LSI
    * ``s``  -- sentiment counts, always appended (even if already present)
    * ``tl`` -- ``tfidf_ngrams`` with LSI
    * ``bp`` -- ``kim_pos``

    With ``rf`` in ``sys.argv`` the ``rfc`` / ``rfr`` models are used,
    otherwise ``svm`` / ``svr``.  Results are reported through ``metrics``
    and ``show_regression``; nothing is returned.  If no feature flag is
    given the process exits with a usage message.
    """
    train_file = '/home/ak/Courses/cs73/project/dataset/small_train.txt'
    test_file = '/home/ak/Courses/cs73/project/dataset/small_test.txt'

    train_feats = []
    test_feats = []
    # A dict is used so the nested helper can update the flag without
    # ``nonlocal``.
    sentiment_state = {'included': False}

    def add_pair(train_part, test_part):
        # Keep the train and test feature lists in lockstep.
        train_feats.append(train_part)
        test_feats.append(test_part)

    def add_sentiment_once():
        # Append the sentiment counts unless an earlier flag already did.
        if not sentiment_state['included']:
            add_pair(many_sentiment(train_file), many_sentiment(test_file))
            sentiment_state['included'] = True

    if 'k' in sys.argv:
        kim_train, kim_test = kim_features(train_file, test_file)
        add_pair(kim_train, kim_test)
        add_sentiment_once()
    if 'o' in sys.argv:
        add_pair(omahony_features(train_file), omahony_features(test_file))
        add_sentiment_once()
    if 'l' in sys.argv:
        add_pair(liu_features(train_file), liu_features(test_file))
        add_sentiment_once()
    if 'z' in sys.argv:
        # NOTE(review): this branch used to mark sentiment as included
        # *before* its own "if not included" check, so it never added the
        # sentiment counts.  That behaviour is kept; if 'z' should behave
        # like 'k'/'o'/'l', call add_sentiment_once() here.
        add_pair(zhang_features(train_file), zhang_features(test_file))
    if 't' in sys.argv:
        tfidf_train, tfidf_test = tfidf_ngrams(train_file,
                                               test_file,
                                               with_lsi=False)
        add_pair(tfidf_train, tfidf_test)
    if 's' in sys.argv:
        add_pair(many_sentiment(train_file), many_sentiment(test_file))
    if 'tl' in sys.argv:
        tfidf_train, tfidf_test = tfidf_ngrams(train_file,
                                               test_file,
                                               with_lsi=True)
        add_pair(tfidf_train, tfidf_test)
    if 'bp' in sys.argv:
        add_pair(kim_pos(train_file), kim_pos(test_file))

    if not train_feats:
        # Previously this case crashed with a bare IndexError below.
        sys.exit("no feature set selected; "
                 "pass at least one of: k o l z t s tl bp")

    if len(train_feats) > 1:
        X_train = scipy.sparse.hstack(train_feats)
        X_test = scipy.sparse.hstack(test_feats)
    else:
        X_train = train_feats[0]
        X_test = test_feats[0]

    svm.normalize(X_train)
    svm.normalize(X_test)

    # Classification targets
    t_train_thresh = svm.compile_targets(train_file)
    t_test_thresh = svm.compile_targets(test_file)

    # Fit an extra-trees model only to print per-feature importances.  The
    # former ``.transform(X_train)`` call is gone: its result was never
    # used and the method no longer exists in current scikit-learn.
    clf = ExtraTreesClassifier()
    clf.fit(X_train.toarray(), t_train_thresh)
    importances = clf.feature_importances_
    if importances.shape[0] < 500:
        for i, importance in enumerate(importances):
            print("%s %s" % (i, importance))
    # (A disabled RFECV feature-selection experiment with a linear SVC
    # used to sit here as a string literal.)

    use_forest = 'rf' in sys.argv

    # Classification
    if use_forest:
        class_model = rfc.train(X_train.todense(), t_train_thresh)
        y_pred = rfc.test(class_model, X_test.todense())
    else:
        class_model = svm.train(X_train, t_train_thresh)
        y_pred = svm.test(class_model, X_test)
    metrics.run_classification_metrics(t_test_thresh, y_pred)
    print("")

    # Regression
    t_train = svr.compile_targets(train_file)
    t_test = svr.compile_targets(test_file)
    if use_forest:
        reg_model = rfr.train(X_train.todense(), t_train)
        y_pred = rfr.test(reg_model, X_test.todense())
    else:
        reg_model = svr.train(X_train, t_train)
        y_pred = svr.test(reg_model, X_test)

    metrics.run_regression_metrics(t_test, y_pred)

    show_regression(y_pred, t_test)