def _utc_timestamp():
    """Return the current UTC time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    return strftime("%Y-%m-%d %H:%M:%S", gmtime())


def test(training_file, test_file, method="rf"):
    """Train the classifier chosen by *method* and print metrics for *test_file*.

    Parameters:
        training_file: passed unchanged to the classifier constructor (or to
            ``svm.test`` when *method* is ``'svm'``).
        test_file: tab-delimited numeric file read with ``genfromtxt``; the
            last column is taken as the expected output, every other column
            as a feature.
        method: ``"gb"``, ``"et"`` or ``"svm"``; any other value selects
            ``RFClassifier(training_file, 100)``.

    Side effects:
        Prints progress and metric reports to stdout. With ``method='svm'``
        the work is delegated to ``svm.test`` and the process then exits
        through ``sys.exit(0)``.
    """
    print("testing...")
    sys.stdout.write("%s:training... " % _utc_timestamp())
    sys.stdout.flush()

    if method == "gb":
        classifier = GBClassifier(training_file)
    elif method == "et":
        classifier = ETClassifier(training_file)
    elif method == 'svm':
        svm.test(training_file, test_file)
        print("(%s) DONE." % _utc_timestamp())
        sys.exit(0)
    else:
        classifier = RFClassifier(training_file, 100)
    print("(%s) DONE." % _utc_timestamp())

    my_data = genfromtxt(test_file, delimiter='\t', skip_header=0)
    n_col = my_data.shape[1]
    # The last column holds the expected outputs used for testing.
    n_features = n_col - 1
    X = my_data[:, :n_features]
    Y = my_data[:, n_features]
    predictions = classifier.predict(X)

    if np.unique(Y).size == 2:
        # AUC and ROC only make sense for binary classification.
        fpr, tpr, thresholds = metrics.roc_curve(Y, predictions)
        print("auc/roc report: ")
        roc_fields = (fpr, tpr, metrics.auc(fpr, tpr), thresholds)
        print(" ".join(str(field) for field in roc_fields))
    else:
        print('nDCG')
        head_list_limit = None
        print(compute_nDCG(Y, predictions, head_list_limit))
        print('nDCG 2')
        print(compute_nDCG_2(Y, predictions, head_list_limit))
        # Weighted precision for the multi-class case (result in 0-1).
        # labels / pos_label are left at their defaults: passing them
        # positionally is rejected by current scikit-learn releases.
        weighted_precision = metrics.precision_score(
            Y, predictions, average='weighted')
        print("precision score: " + str(weighted_precision))

    # Both branches finish with the same two reports.
    print("full classification report: ")
    print(metrics.classification_report(Y, predictions))
    # NOTE(review): label 1 is assumed to be the rarest class - confirm.
    print("report for the rarest class: ")
    print(metrics.classification_report(Y, predictions, labels=[1]))
def evaluate_one_doc(clf_name, clf, phrases, features, true_keys, N=10):
    """Score the top-N predicted key phrases of one document.

    Parameters:
        clf_name: ``'NB'`` or ``'svm'``; selects which module's ``test``
            function produces the predicted indices. Any other value leaves
            the prediction list empty.
        clf: trained model handed to the selected ``test`` function.
        phrases: sequence indexed by the predicted indices to obtain the
            predicted key phrases.
        features: feature data handed to the selected ``test`` function.
        true_keys: reference key phrases for the document.
        N: number of predictions requested from the classifier.

    Returns:
        ``(precision, recall)`` as computed by ``get_precision`` and
        ``get_recall`` on ``true_keys`` versus the predicted keys.
    """
    pred_idx = []
    if clf_name == 'NB':
        pred_idx = NB.test(clf, N, features)
    elif clf_name == 'svm':
        pred_idx = svm.test(clf, N, features)

    # Map the predicted indices back to their phrases (top N pred keys).
    pred_keys = [phrases[idx] for idx in pred_idx]
    # Report the count only once the list is populated; printing it right
    # after creating the empty list always showed 0.
    print("# pred_keys " + str(len(pred_keys)))

    print("--pred_keys:")
    print(pred_keys)
    print("--true keys:")
    print(true_keys)

    precision = get_precision(true_keys, pred_keys)
    recall = get_recall(true_keys, pred_keys)
    return precision, recall
SVM = svm.train(gama = 0.001, descriptor_name = arguments.descriptor, model_name = 'SVM') # Print print('Done!\n') # Print print('Testing Support Vector Machine Model\n') # Test SVM Model print('Testing %s SVM Model\n' % arguments.descriptor, file = globals.file) # SVM Model SVM_predict = svm.test(model = SVM, descriptor_name = arguments.descriptor, model_name = 'SVM') # Print print('Done!\n') # Print print('Classification Report\n') # Print print('Classification Report\n', file = globals.file) # SVM Model svm.classificationReport(model = SVM, predict = SVM_predict, descriptor_name = arguments.descriptor,
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 for a zero denominator."""
    if denominator == 0:
        return 0.0
    return float(numerator) / float(denominator)


def bootstrapping(B, X, y, C):
    """Evaluate ``svm.train`` / ``svm.test`` over B bootstrap resamples.

    Each round draws ``n`` row indices with replacement for training and
    tests on the rows that were never drawn. Predictions and labels are
    compared against +1 and -1 to fill a confusion matrix.

    Parameters:
        B: number of bootstrap rounds.
        X: sample matrix, one row per sample (indexed with index lists).
        y: labels aligned with the rows of X; counted as +1 / -1.
        C: passed through to ``svm.train``.

    Returns:
        ``(accuracy, error, recall, precision, specificity)``: five arrays
        of length B, where ``error`` is ``1 - accuracy``. A ratio whose
        denominator is zero is reported as 0.0.
    """
    accuracy = np.zeros(B)
    precision = np.zeros(B)
    recall = np.zeros(B)
    specificity = np.zeros(B)
    n = X.shape[0]

    for b in range(B):
        train_samples = list(np.random.randint(0, n, n))
        # Out-of-bag rows: every index that was not drawn for training.
        test_samples = sorted(set(range(n)) - set(train_samples))

        # train the model
        theta = svm.train(X[train_samples], y[train_samples], C)

        testSet = X[test_samples]
        testLabels = y[test_samples]

        tp = tn = fp = fn = 0
        for j in range(testSet.shape[0]):
            # The transpose is kept from the original; it only matters when
            # a row is itself two-dimensional (e.g. X is a numpy matrix).
            test_point = testSet[j, :].T
            test_label = testLabels[j]
            testResult = svm.test(theta, test_point)
            if testResult == 1 and test_label == 1:
                tp += 1
            elif testResult == 1 and test_label == -1:
                fp += 1
            elif testResult == -1 and test_label == 1:
                fn += 1
            elif testResult == -1 and test_label == -1:
                tn += 1

        accuracy[b] = _safe_ratio(tp + tn, fn + fp + tp + tn)
        recall[b] = _safe_ratio(tp, tp + fn)
        precision[b] = _safe_ratio(tp, tp + fp)
        specificity[b] = _safe_ratio(tn, tn + fp)

    error = np.ones(B)
    error -= accuracy
    return accuracy, error, recall, precision, specificity
test_x = [numpy.array([float(i) for i in x[2:]]) for x in temp_data[b:]] test_y = [1.0 if x[1] == 'M' else -1.0 for x in temp_data[b:]] return train_x, train_y, test_x, test_y def main(): f = open('wdbc.data') lines = f.readlines() data = [x for x in lines] return data data = main() train_x, train_y, test_x, test_y = split(data, 0.4) c = svm.get_c(train_x, train_y) tsvm = svm.train(train_x, train_y, c) psvm, rsvm = svm.test(test_x, test_y, tsvm) esvm = 2 * psvm * rsvm / (psvm + psvm) print("svm:") print("\tF1 %.3f " %esvm) print("\tprecision %.3f, recall %.3f" %(psvm, rsvm)) tp = perceptrone.train(train_x, train_y) pp, rp = perceptrone.test(test_x, test_y, tp) ep = 2 * pp * rp / (pp + rp) print("lp:") print("\tF1 %.3f " %ep) print("\tprecision %.3f, recall %.3f" %(pp, rp)) c = svm_smo.get_c(train_x, train_y, kernels.poly)
def _f1(precision, recall):
    """Return the F1 score 2*p*r / (p + r), or 0.0 when p + r is zero."""
    total = precision + recall
    if total == 0:
        return 0.0
    return 2 * precision * recall / total


data = main()
train_x, train_y, test_x, test_y = split(data, 0.4)

# nw model
c = nw.get_c(train_x, train_y)
w1, w2 = nw.train(train_x, train_y, c)
pnw, rnw = nw.test(test_x, test_y, w1, w2)
enw = _f1(pnw, rnw)
print("nw:")
print("\tF1 %.3f " % enw)
print("\tprecision %.3f, recall %.3f" % (pnw, rnw))

# svm model
c = svm.get_c(train_x, train_y)
tsvm = svm.train(train_x, train_y, c)
psvm, rsvm = svm.test(test_x, test_y, tsvm)
# The original divided by (psvm + psvm); F1 needs precision + recall.
esvm = _f1(psvm, rsvm)
print("svm:")
print("\tF1 %.3f " % esvm)
print("\tprecision %.3f, recall %.3f" % (psvm, rsvm))

# perceptrone model
tp = perceptrone.train(train_x, train_y)
pp, rp = perceptrone.test(test_x, test_y, tp)
ep = _f1(pp, rp)
print("lp:")
print("\tF1 %.3f " % ep)
print("\tprecision %.3f, recall %.3f" % (pp, rp))

c = svm_smo.get_c(train_x, train_y, kernels.poly)
# Build the training set by stacking the samples and labels of both classes.
x = np.concatenate([train_data_1, train_data_2])
y = np.concatenate([train_label_1, train_label_2])

# Train the models: two svm.train variants ("rbf", "linear") and linear.train.
svm_rbf = svm.train(x, y, "rbf")
svm_linear = svm.train(x, y, "linear")
w, mean = linear.train(x, y)

# Build the test set the same way.
test_data = np.concatenate([test_data_1, test_data_2])
test_label = np.concatenate([test_label_1, test_label_2])

# Predict labels for the test set with each trained model.
svm_rbf_label = svm.test(test_data, svm_rbf)
linear_label = linear.test(test_data, w, mean)
svm_linear_label = svm.test(test_data, svm_linear)

# Error of each prediction against the true test labels.
svm_rbf_error = error_rate(test_label, svm_rbf_label)
linear_error = error_rate(test_label, linear_label)
svm_linear_error = error_rate(test_label, svm_linear_label)

# These two helpers receive the per-class train/test data directly.
mle_error = testing_mle.testing(
    train_data_1, test_data_1, train_data_2, test_data_2)
parzen_error = testing_parzen.testing(
    train_data_1, test_data_1, train_data_2, test_data_2, 3)

# Report every error as "<caption> <value>", one per line.
print("svm error(rbf):" + " " + str(svm_rbf_error))
print("svm error(linear):" + " " + str(svm_linear_error))
print("linaer_classifer error:" + " " + str(linear_error))
print("mle error:" + " " + str(mle_error))
print("parzen error:" + " " + str(parzen_error))
return numpy.array(features) def kim_tfidf_ngrams(filename): return uni_features, bi_features def many_sentiment(filename): return sentiment.get_sentiment_counts(filename) if __name__ == "__main__": train_file = '/home/ak/Courses/cs73/project/dataset/small_train.txt' kim = kim_pos(train_file) # 5 features zhang = zhang_pos(train_file) # 7 features sent = many_sentiment(train_file) # 2 features X_train = numpy.hstack((kim, zhang, sent)) t_train = svm.compile_targets(train_file) model = svm.train(X_train, t_train) test_file = '/home/ak/Courses/cs73/project/dataset/small_test.txt' kim = kim_pos(test_file) # 5 features zhang = zhang_pos(test_file) # 7 features sent = many_sentiment(test_file) # 2 features X_test = numpy.hstack((kim, zhang, sent)) t_test = svm.compile_targets(test_file) y_pred = svm.test(model, X_test) metrics.run_classification_metrics(t_test, y_pred)
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 for a zero denominator."""
    if denominator == 0:
        return 0.0
    return float(numerator) / float(denominator)


def cross_validation(X, y, foldcount, C):
    """Run k-fold cross validation of ``svm.train`` / ``svm.test``.

    The folds come from ``cross_validation_split(y, foldcount)``. Each fold
    is used once as the test set while the remaining folds are joined into
    the training set. Predictions and labels are compared against +1 and -1
    to fill a confusion matrix.

    Parameters:
        X: sample matrix, one row per sample (indexed with index lists).
        y: labels aligned with the rows of X; counted as +1 / -1.
        foldcount: number of folds.
        C: passed through to ``svm.train``.

    Returns:
        ``(accuracy, error, recall, precision, specificity)``: five arrays
        of length ``foldcount``, where ``error`` is ``1 - accuracy``. A
        ratio whose denominator is zero is reported as 0.0.
    """
    accuracy = np.zeros(foldcount)
    precision = np.zeros(foldcount)
    recall = np.zeros(foldcount)
    specificity = np.zeros(foldcount)

    # extract k folds from the data
    split = cross_validation_split(y, foldcount)

    for j in range(foldcount):
        # Fold j is the test set; all other folds form the training set.
        testInd = split[j]
        trainInd = [idx
                    for i in range(foldcount) if i != j
                    for idx in split[i]]

        trainSet = X[trainInd]
        trainLabels = y[trainInd]
        testSet = X[testInd]
        testLabels = y[testInd]

        # train the model
        theta = svm.train(trainSet, trainLabels, C)

        # Confusion-matrix counts for this fold.
        tp = tn = fp = fn = 0
        for i in range(len(testInd)):
            test_label = testLabels[i]
            testResult = svm.test(theta, testSet[i])
            if testResult == 1 and test_label == 1:
                tp += 1
            elif testResult == 1 and test_label == -1:
                fp += 1
            elif testResult == -1 and test_label == 1:
                fn += 1
            elif testResult == -1 and test_label == -1:
                tn += 1

        accuracy[j] = _safe_ratio(tp + tn, fn + fp + tp + tn)
        recall[j] = _safe_ratio(tp, tp + fn)
        precision[j] = _safe_ratio(tp, tp + fp)
        specificity[j] = _safe_ratio(tn, tn + fp)

    error = np.ones(foldcount)
    error -= accuracy
    return accuracy, error, recall, precision, specificity
def _collect_feature_groups(flags, train_file, test_file):
    """Return (train_feats, test_feats) lists for the feature flags in *flags*."""
    train_feats = []
    test_feats = []
    sent_included = False

    def add(train_part, test_part):
        train_feats.append(train_part)
        test_feats.append(test_part)

    def add_sentiment():
        add(many_sentiment(train_file), many_sentiment(test_file))

    if 'k' in flags:
        kim_train, kim_test = kim_features(train_file, test_file)
        add(kim_train, kim_test)
        if not sent_included:
            add_sentiment()
            sent_included = True
    if 'o' in flags:
        add(omahony_features(train_file), omahony_features(test_file))
        if not sent_included:
            add_sentiment()
            sent_included = True
    if 'l' in flags:
        add(liu_features(train_file), liu_features(test_file))
        if not sent_included:
            add_sentiment()
            sent_included = True
    if 'z' in flags:
        add(zhang_features(train_file), zhang_features(test_file))
        # The original set the flag here *before* its own
        # "if not sent_included" check, so this branch never appended the
        # sentiment features itself. That behaviour is kept.
        # NOTE(review): confirm this is intended.
        sent_included = True
    if 't' in flags:
        tfidf_train, tfidf_test = tfidf_ngrams(
            train_file, test_file, with_lsi=False)
        add(tfidf_train, tfidf_test)
    if 's' in flags:
        # Appended unconditionally, even if a branch above already added it.
        add_sentiment()
    if 'tl' in flags:
        tfidf_train, tfidf_test = tfidf_ngrams(
            train_file, test_file, with_lsi=True)
        add(tfidf_train, tfidf_test)
    if 'bp' in flags:
        add(kim_pos(train_file), kim_pos(test_file))
    return train_feats, test_feats


def main():
    """Build features selected on the command line, then classify and regress.

    Feature flags read from ``sys.argv``: ``k``, ``o``, ``l``, ``z``, ``t``,
    ``s``, ``tl``, ``bp``. When ``rf`` is present the ``rfc`` / ``rfr``
    modules are used on dense matrices; otherwise ``svm`` / ``svr`` are
    used. Results are reported through ``metrics.run_classification_metrics``,
    ``metrics.run_regression_metrics`` and ``show_regression``.

    Exits with an error message when no feature flag is given (the original
    failed with an IndexError at that point).
    """
    train_file = '/home/ak/Courses/cs73/project/dataset/small_train.txt'
    test_file = '/home/ak/Courses/cs73/project/dataset/small_test.txt'

    train_feats, test_feats = _collect_feature_groups(
        sys.argv, train_file, test_file)
    if not train_feats:
        sys.exit("no feature group selected; "
                 "pass at least one of: k o l z t s tl bp")

    if len(train_feats) > 1:
        X_train = scipy.sparse.hstack(train_feats)
        X_test = scipy.sparse.hstack(test_feats)
    else:
        X_train = train_feats[0]
        X_test = test_feats[0]

    svm.normalize(X_train)
    svm.normalize(X_test)

    # Classification (SV)
    t_train_thresh = svm.compile_targets(train_file)
    t_test_thresh = svm.compile_targets(test_file)

    # Fit a tree ensemble only to inspect feature importances. The original
    # also called .transform() on the fitted estimator and discarded the
    # result; that method no longer exists in scikit-learn, so it is dropped.
    # NOTE(review): .toarray()/.todense() assume X_train is a scipy sparse
    # matrix even when a single feature group is selected - confirm.
    clf = ExtraTreesClassifier()
    clf.fit(X_train.toarray(), t_train_thresh)
    importances = clf.feature_importances_
    if importances.shape[0] < 500:
        for i, importance in enumerate(importances):
            print(str(i) + " " + str(importance))

    use_random_forest = 'rf' in sys.argv

    if not use_random_forest:
        class_model = svm.train(X_train, t_train_thresh)
        y_pred = svm.test(class_model, X_test)
    else:
        class_model = rfc.train(X_train.todense(), t_train_thresh)
        y_pred = rfc.test(class_model, X_test.todense())
    metrics.run_classification_metrics(t_test_thresh, y_pred)
    print("")

    # Regression (SVR)
    t_train = svr.compile_targets(train_file)
    t_test = svr.compile_targets(test_file)
    if not use_random_forest:
        reg_model = svr.train(X_train, t_train)
        y_pred = svr.test(reg_model, X_test)
    else:
        reg_model = rfr.train(X_train.todense(), t_train)
        y_pred = rfr.test(reg_model, X_test.todense())
    metrics.run_regression_metrics(t_test, y_pred)
    show_regression(y_pred, t_test)