def baseline(tweets_train, train_labels, tweets_test, test_labels):
    # Import the subjectivity lexicon
    subj_dict = data_proc.get_subj_lexicon()

    types_of_features = ['1', '2', '3', 'ngrams']
    for t in types_of_features:
        start = time.time()
        utils.print_model_title("Classification using feature type " + t)
        # Compare with == (identity checks via 'is' on strings are unreliable)
        if t == '1':
            x_train_features = extract_baseline_features.get_features1(tweets_train, subj_dict)
            x_test_features = extract_baseline_features.get_features1(tweets_test, subj_dict)
        elif t == '2':
            x_train_features = extract_baseline_features.get_features2(tweets_train, subj_dict)
            x_test_features = extract_baseline_features.get_features2(tweets_test, subj_dict)
        elif t == '3':
            x_train_features = extract_baseline_features.get_features3(tweets_train, subj_dict)
            x_test_features = extract_baseline_features.get_features3(tweets_test, subj_dict)
        elif t == 'ngrams':
            ngram_map, x_train_features = extract_baseline_features.get_ngram_features(tweets_train, n=1)
            x_test_features = extract_baseline_features.get_ngram_features_from_map(tweets_test, ngram_map, n=1)

        # Get the class ratio
        class_ratio = utils.get_classes_ratio_as_dict(train_labels)

        # Train a linear Support Vector Classifier
        print("\nEvaluating a linear SVM model...")
        classifiers.linear_svm(x_train_features, train_labels, x_test_features, test_labels, class_ratio)

        # Train a Logistic Regression classifier
        print("\nEvaluating a logistic regression model...")
        classifiers.logistic_regression(x_train_features, train_labels, x_test_features, test_labels, class_ratio)

        end = time.time()
        print("Completion time of the baseline model with features type %s: %.3f s = %.3f min"
              % (t, (end - start), (end - start) / 60.0))
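# classifiers.linear_svm and classifiers.logistic_regression are not shown in
# this snippet. A minimal sketch of what they might look like, assuming
# class_ratio is a {label: weight} dict forwarded to scikit-learn's
# class_weight parameter (names and signatures are assumptions, not the
# project's actual implementation):
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC


def linear_svm(x_train, y_train, x_test, y_test, class_ratio):
    # Fit a class-weighted linear SVM and report test-set metrics
    model = LinearSVC(class_weight=class_ratio)
    model.fit(x_train, y_train)
    print(metrics.classification_report(y_test, model.predict(x_test)))
    return model


def logistic_regression(x_train, y_train, x_test, y_test, class_ratio):
    # Same evaluation protocol with a logistic regression model
    model = LogisticRegression(class_weight=class_ratio, max_iter=1000)
    model.fit(x_train, y_train)
    print(metrics.classification_report(y_test, model.predict(x_test)))
    return model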
def run_classifiers_with_doc2vec(reviews, scores, lang='en'):
    '''Corpus should be an array of TaggedDocument objects.'''
    corpus = list(get_corpus(reviews, scores, lang))[:20000]
    train_corpus, test_corpus = train_test_split(corpus, test_size=0.25, random_state=42)

    doc2vec_model = create_doc2vec_model(train_corpus)
    train_targets, train_regressors = zip(*[(doc.words, doc.tags[0]) for doc in train_corpus])
    test_targets, test_regressors = zip(*[(doc.words, doc.tags[0]) for doc in test_corpus])

    # For every review, we apply doc2vec_model.infer_vector(review). This creates
    # a feature vector for every document (in our case, review) in the corpus.
    train_x, train_y = get_train_lists(doc2vec_model, train_targets, train_regressors)
    test_x, test_y = get_test_lists(doc2vec_model, test_targets, test_regressors)

    classifier = classifiers.logistic_regression(train_x, train_y)
    # classifier = classifier_func(train_x, train_y)  # return logistic_reg

    # Training metrics
    train_predictions = classifier.predict(train_x)
    train_accuracy = metrics.accuracy_score(train_y, train_predictions)
    class_probabilities_train = classifier.predict_proba(train_x)
    train_auc_score = metrics.roc_auc_score(train_y, class_probabilities_train[:, 1])
    print('\nTraining:')
    print('  accuracy:', format(100 * train_accuracy, '.2f'))
    print('  AUC value:', format(100 * train_auc_score, '.2f'))

    # Test metrics
    test_predictions = classifier.predict(test_x)
    test_accuracy = metrics.accuracy_score(test_y, test_predictions)
    class_probabilities_test = classifier.predict_proba(test_x)
    test_auc_score = metrics.roc_auc_score(test_y, class_probabilities_test[:, 1])
    print('\nTesting:')
    print('  accuracy:', format(100 * test_accuracy, '.2f'))
    print('  AUC value:', format(100 * test_auc_score, '.2f'))
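# get_train_lists and get_test_lists are not shown. A minimal sketch, assuming
# they infer one Doc2Vec vector per tokenised document and pass the labels
# through unchanged (hypothetical helpers; get_test_lists would be analogous):
def get_train_lists(model, words_per_doc, labels):
    # infer_vector produces a fixed-size embedding for an unseen token list
    features = [model.infer_vector(words) for words in words_per_doc]
    return features, list(labels)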
def perform_cross_validation_split(data, num_splits, filename):
    y = data['defect_status']
    del data['defect_status']  # drop the target column before training
    X = data.to_numpy()

    # Perform k-fold cross-validation
    kf = KFold(n_splits=num_splits)
    for train, test in kf.split(X):
        X_train, X_test = X[train], X[test]
        # Use positional indexing so non-default DataFrame indices also work
        y_train, y_test = y.iloc[train], y.iloc[test]
        actual = y_test.tolist()

        # Classification and Regression Tree (CART)
        prediction = classifiers.cart(X_train, y_train, X_test)
        calculate_metrics(prediction, actual, "cart")

        # K-Nearest Neighbors (KNN)
        prediction = classifiers.knn(X_train, y_train, X_test)
        calculate_metrics(prediction, actual, "knn")

        # Logistic Regression (LR)
        prediction = classifiers.logistic_regression(X_train, y_train, X_test)
        calculate_metrics(prediction, actual, "lr")

        # Naive Bayes (NB)
        prediction = classifiers.naive_bayes(X_train, y_train, X_test)
        calculate_metrics(prediction, actual, "nb")

        # Random Forest (RF)
        prediction = classifiers.random_forest(X_train, y_train, X_test)
        calculate_metrics(prediction, actual, "rf")

    # Average the per-fold metrics for each model
    print("\nCalculating mean metrics...")
    filename = filename.split("IST_")[1].split(".csv")[0]

    precision, recall, f_score, auc = calculate_mean_metrics(
        precision_results_cart, recall_results_cart, f_score_results_cart, auc_results_cart)
    save_results(precision, recall, f_score, auc, filename, "CART")

    precision, recall, f_score, auc = calculate_mean_metrics(
        precision_results_knn, recall_results_knn, f_score_results_knn, auc_results_knn)
    save_results(precision, recall, f_score, auc, filename, "KNN")

    precision, recall, f_score, auc = calculate_mean_metrics(
        precision_results_lr, recall_results_lr, f_score_results_lr, auc_results_lr)
    save_results(precision, recall, f_score, auc, filename, "LR")

    precision, recall, f_score, auc = calculate_mean_metrics(
        precision_results_nb, recall_results_nb, f_score_results_nb, auc_results_nb)
    save_results(precision, recall, f_score, auc, filename, "NB")

    precision, recall, f_score, auc = calculate_mean_metrics(
        precision_results_rf, recall_results_rf, f_score_results_rf, auc_results_rf)
    save_results(precision, recall, f_score, auc, filename, "RF")
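# calculate_metrics is not shown. A plausible sketch, assuming it scores a
# single fold and appends to module-level result lists that are later consumed
# by calculate_mean_metrics (helper and list names are assumptions):
from sklearn import metrics


def calculate_metrics(prediction, actual, model_name):
    scores = (metrics.precision_score(actual, prediction),
              metrics.recall_score(actual, prediction),
              metrics.f1_score(actual, prediction),
              metrics.roc_auc_score(actual, prediction))
    if model_name == "cart":
        for result_list, score in zip((precision_results_cart, recall_results_cart,
                                       f_score_results_cart, auc_results_cart), scores):
            result_list.append(score)
    # ... analogous branches for "knn", "lr", "nb" and "rf"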
def baseline(tweets_train, train_labels, tweets_test, test_labels):
    subj_dict = dproc.get_subj_lexicon('hindi_lexicon.tff')

    types_of_features = ['1', '2', 'ngrams']  # feature type '3' is removed
    for t in types_of_features:
        start = time.time()
        utils.print_model_title("Classification using features type " + t)
        # Compare with == (identity checks via 'is' on strings are unreliable)
        if t == '1':
            x_train_features = extract_baseline_features.get_features1(tweets_train, subj_dict)
            x_test_features = extract_baseline_features.get_features1(tweets_test, subj_dict)
        elif t == '2':
            x_train_features = extract_baseline_features.get_features2(tweets_train, subj_dict)
            x_test_features = extract_baseline_features.get_features2(tweets_test, subj_dict)
        # elif t == '3':
        #     x_train_features = extract_baseline_features.get_feature3(tweets_train, subj_dict)
        #     x_test_features = extract_baseline_features.get_feature3(tweets_test, subj_dict)
        elif t == 'ngrams':
            ngram_map, x_train_features = extract_baseline_features.get_ngram_features(tweets_train, n=1)
            x_test_features = extract_baseline_features.get_ngram_features_from_map(tweets_test, ngram_map, n=1)

        # Get the class ratio
        class_ratio = utils.get_classes_ratio_as_dict(train_labels)

        # Train a linear Support Vector Classifier
        print('\nEvaluating a linear SVM model...')
        classifiers.linear_svm(x_train_features, train_labels, x_test_features, test_labels, class_ratio)

        # Train a logistic regression model
        print('\nEvaluating a logistic regression model...')
        classifiers.logistic_regression(x_train_features, train_labels, x_test_features, test_labels, class_ratio)

        end = time.time()
        print("Completion time of the baseline model with features type %s: %.3f s = %.3f min"
              % (t, (end - start), (end - start) / 60.0))
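# extract_baseline_features.get_ngram_features and get_ngram_features_from_map
# are not shown. A minimal sketch, assuming tweets are whitespace-tokenisable
# strings and the map fixes one shared feature space for train and test
# (illustrative only, not the project's actual implementation):
import numpy as np


def get_ngram_features(tweets, n=1):
    # Assign an incremental column index to every n-gram seen in training
    ngram_map = {}
    for tweet in tweets:
        tokens = tweet.split()
        for i in range(len(tokens) - n + 1):
            ngram_map.setdefault(' '.join(tokens[i:i + n]), len(ngram_map))
    return ngram_map, get_ngram_features_from_map(tweets, ngram_map, n)


def get_ngram_features_from_map(tweets, ngram_map, n=1):
    # Count features over the training vocabulary; unseen n-grams are dropped
    features = np.zeros((len(tweets), len(ngram_map)))
    for row, tweet in enumerate(tweets):
        tokens = tweet.split()
        for i in range(len(tokens) - n + 1):
            ngram = ' '.join(tokens[i:i + n])
            if ngram in ngram_map:
                features[row, ngram_map[ngram]] += 1
    return features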
def supperclassify(train_set, train_label, test_set, test_label):
    '''Compare majority voting, weighted voting and individual classifiers.'''
    # Majority voting, evaluated on the training set
    train_voted = voting(train_set)
    aux = train_voted == train_label
    correct = sum(aux.astype(int))
    _accuracy = (correct * 100) / len(train_label)
    _precision, _recall, _f1score, _support = ut.get_measures_for_each_class(train_label, train_voted)
    print('Estimator VOTING')
    print('Average Accuracy:\t', _accuracy)
    print('Average Precision:\t', _precision)
    print('Average Recall:\t', _recall)
    print('Average F1 Measure:\t', _f1score)
    print('\n')

    # Weighted voting: learn the weights on train, apply them on test
    lambdas = weighted_voting_getlambdas(train_set, train_label)
    results = weighted_voting(test_set, lambdas)
    aux = results == test_label
    correct = sum(aux.astype(int))
    _accuracy = (correct * 100) / len(test_label)
    _precision, _recall, _f1score, _support = ut.get_measures_for_each_class(test_label, results)
    print('Estimator W_VOTING')
    print('Average Accuracy:\t', _accuracy)
    print('Average Precision:\t', _precision)
    print('Average Recall:\t', _recall)
    print('Average F1 Measure:\t', _f1score)

    # Individual classifiers for comparison
    rf = clf.classifier_randomForest(train_set, train_label)
    results = clf.evaluateResults(rf, test_set, test_label, estimator_name='RF')

    lr = clf.logistic_regression(train_set, train_label)
    results = clf.evaluateResults(lr, test_set, test_label, estimator_name='LR')

    svm = clf.classifier_svm(train_set, train_label)
    results = clf.evaluateResults(svm, test_set, test_label, estimator_name='SVM')

    rbf = clf.rbf_classifier(train_set, train_label)
    results = clf.evaluateResults(rbf, test_set, test_label, estimator_name='RBF')
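# voting and weighted_voting are not shown. A minimal sketch, assuming each
# row of the input holds one sample, each column holds one base classifier's
# predicted binary (0/1) label, and lambdas holds one weight per classifier
# (hypothetical layout, illustrative only):
import numpy as np
from scipy import stats


def voting(predictions):
    # Majority label across base classifiers for every sample
    mode, _ = stats.mode(predictions, axis=1)
    return np.asarray(mode).ravel()


def weighted_voting(predictions, lambdas):
    # Weighted average of the per-classifier votes, thresholded at 0.5
    weights = np.asarray(lambdas, dtype=float)
    scores = predictions @ (weights / weights.sum())
    return (scores >= 0.5).astype(int)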
random.shuffle(train_set)
Y_train = np.array([row[0] for row in train_set])
X_train = np.array([row[1] for row in train_set])

test_set = []
for key, value in test.items():
    for file_id, words in value.items():
        # Bag-of-words counts over the training vocabulary;
        # out-of-vocabulary words map to the '</unk>' token
        bow = np.zeros(len(vocab))
        for word in words:
            if word in vocab:
                bow[vocab[word]] += 1
            else:
                bow[vocab['</unk>']] += 1
        # Tf-idf weighted average of the 300-d GloVe vectors
        tf = bow / len(words)
        tfidf = tf * idf_test
        avg_glove = np.zeros(300)
        for i in range(len(vocab)):
            avg_glove += tfidf[i] * glove[i]
        test_set.append([key, avg_glove])

print('shuffling test')
random.shuffle(test_set)
Y_test = np.array([row[0] for row in test_set])
X_test = np.array([row[1] for row in test_set])

print("Calling Classifiers\n")
print("Accuracy for Naive Bayes is : ", naive_bayes(X_train, Y_train, X_test, Y_test))
print("Accuracy for Logistic Regression is : ", logistic_regression(X_train, Y_train, X_test, Y_test))
print("Accuracy for SVM is : ", svm(X_train, Y_train, X_test, Y_test))
print("Accuracy for FF Neural Net is : ", fnn(X_train, Y_train, X_test, Y_test))
# print("Accuracy for Recurrent Neural Net is : ", rnn(X_train, Y_train, X_test, Y_t))
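# The classifier wrappers (naive_bayes, logistic_regression, svm, fnn) are not
# shown. A plausible sketch of logistic_regression, assuming every wrapper
# fits a model and returns test-set accuracy (the signature is an assumption):
from sklearn.linear_model import LogisticRegression


def logistic_regression(X_train, Y_train, X_test, Y_test):
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, Y_train)
    return model.score(X_test, Y_test)  # mean accuracy on the test set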
# Extract the tweet partition
train_tweets, test_tweets, train_labels, test_labels = tweets[train], tweets[test], labels[train], labels[test]
print(len(test_tweets))
print(len(train_tweets))
train_tweets = np.hstack(train_tweets)

dictionary, tweets_features, vectorizer = bow.bow(train_tweets, vec="tfidf")

# Train the different classifiers.
svm = clf.classifier_svm(tweets_features, train_labels)
rf = clf.classifier_randomForest(tweets_features, train_labels)
ada = clf.adaboost(tweets_features, train_labels)
lr = clf.logistic_regression(tweets_features, train_labels)

# Test the different classifiers on the test tweets.
pred = vectorizer.transform(test_tweets)
pred = pred.toarray()
_results, _accuracyLR, _precisionLR, _recallLR, _f_measureLR = clf.evaluateResults(
    lr, pred, test_labels, estimator_name='Logistic regression', file_name=results_folder)
_results, _accuracyRF, _precisionRF, _recallRF, _f_measureRF = clf.evaluateResults(
    rf, pred, test_labels, estimator_name='RF', file_name=results_folder)
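# bow.bow is not shown. A minimal sketch, assuming it fits a scikit-learn
# vectorizer on the training tweets and returns the vocabulary, a dense
# feature matrix and the fitted vectorizer (illustrative only):
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


def bow(texts, vec="tfidf"):
    vectorizer = TfidfVectorizer() if vec == "tfidf" else CountVectorizer()
    features = vectorizer.fit_transform(texts).toarray()
    return vectorizer.vocabulary_, features, vectorizer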
    reviews.append(review)
    # length_of_reviews.append(len(review))
    return reviews, scores  # , length_of_reviews


if __name__ == '__main__':
    print("OP_SPAM.py")
    reviews, scores = parse_op_spam()
    print(len(reviews), len(scores))
    # Rename the result so it does not shadow the imported bow module
    features, vec = bow.bag_of_words(reviews)
    train_x, test_x, train_y, test_y = train_test_split(features, scores, test_size=0.25, random_state=42)

    classifier = classifiers.logistic_regression(train_x, train_y)

    # Training metrics
    train_predictions = classifier.predict(train_x)
    train_accuracy = metrics.accuracy_score(train_y, train_predictions)
    class_probabilities_train = classifier.predict_proba(train_x)
    train_auc_score = metrics.roc_auc_score(train_y, class_probabilities_train[:, 1])
    print('\nTraining:')
    print('  accuracy:', format(100 * train_accuracy, '.2f'))
    print('  AUC value:', format(100 * train_auc_score, '.2f'))

    # Test metrics
    test_predictions = classifier.predict(test_x)
    test_accuracy = metrics.accuracy_score(test_y, test_predictions)
    class_probabilities_test = classifier.predict_proba(test_x)
    test_auc_score = metrics.roc_auc_score(test_y, class_probabilities_test[:, 1])
    print('\nTesting:')
    print('  accuracy:', format(100 * test_accuracy, '.2f'))
    print('  AUC value:', format(100 * test_auc_score, '.2f'))
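# bow.bag_of_words is not shown. A minimal sketch, assuming it returns a dense
# count matrix plus the fitted vectorizer (hypothetical helper):
from sklearn.feature_extraction.text import CountVectorizer


def bag_of_words(texts):
    vec = CountVectorizer()
    return vec.fit_transform(texts).toarray(), vec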
def main():
    # Decision Tree
    dtree_scores = decision_tree(5)
    X, Y = extract_scores(dtree_scores)
    tree_max = max(Y)
    plt.plot(X, Y, 'r')
    plt.axis([8, 22, 0.94, 0.98])
    plt.title('Decision Tree Classifier Accuracy')
    plt.xlabel('max_depth')
    plt.ylabel('accuracy')
    plt.savefig('dtree.png')
    plt.clf()

    # Logistic Regression
    log_scores = logistic_regression(5)
    X, Y = extract_scores(log_scores)
    log_max = max(Y)
    plt.plot(X, Y, 'r')
    plt.axis([0, 1, 0.65, 0.8])
    plt.title('Logistic Regression Accuracy')
    plt.xlabel('alpha')
    plt.ylabel('accuracy')
    plt.savefig('log.png')
    plt.clf()

    # kNN
    k_scores = kNN(5)
    X, Y = extract_scores(k_scores)
    k_max = max(Y)
    plt.plot(X, Y, 'r')
    plt.axis([0, 15, 0.95, 1.0])
    plt.title('kNN Accuracy')
    plt.xlabel('neighbors')
    plt.ylabel('accuracy')
    plt.savefig('knn.png')
    plt.clf()

    # Neural network
    net_scores = neural_network(5)
    X, Y = extract_scores(net_scores)
    net_max = max(Y)
    plt.plot(X, Y, 'r')
    plt.axis([0, 1.5, 0.95, 1.0])
    plt.title('Neural Network Accuracy')
    plt.xlabel('alpha')
    plt.ylabel('accuracy')
    plt.savefig('net.png')
    plt.clf()

    # Final results. Logistic regression (log_max) is left out of the bar
    # chart: its accuracy sits well below the 0.95 lower y-limit used here.
    best = [tree_max, k_max, net_max]
    models = ['DecisionTree', 'K-NN', 'NeuralNetwork']
    x_pos = [i for i, _ in enumerate(models)]
    plt.bar(x_pos, best, color='green')
    plt.ylim(0.95, 1.0)
    plt.xlabel("Classifiers")
    plt.ylabel("Accuracy")
    plt.title("Best Accuracy from Models")
    plt.xticks(x_pos, models)
    plt.savefig('best.png')
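# extract_scores is not shown. A minimal sketch, assuming each classifier
# helper returns a list of (hyperparameter_value, accuracy) pairs
# (the pair format is a guess, illustrative only):
def extract_scores(scores):
    X = [param for param, _ in scores]
    Y = [accuracy for _, accuracy in scores]
    return X, Y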
print('shuffling train')
random.shuffle(train_set)
Y_train = np.array([row[0] for row in train_set])
X_train = np.array([row[1] for row in train_set])

test_set = []
for key, value in test.items():
    for file_id, words in value.items():
        bow = np.zeros(len(vocab))
        for word in words:
            if word in vocab:
                bow[vocab[word]] += 1
            else:
                bow[vocab['</unk>']] += 1
        bow = bow / len(words)
        test_set.append([key, bow])

print('shuffling test')
random.shuffle(test_set)
Y_test = np.array([row[0] for row in test_set])
X_test = np.array([row[1] for row in test_set])

print("Calling Classifiers\n")
print("Accuracy for Naive Bayes is : ", naive_bayes(X_train, Y_train, X_test, Y_test))
print("Accuracy for Logistic Regression is : ", logistic_regression(X_train, Y_train, X_test, Y_test))
print("Accuracy for SVM is : ", svm(X_train, Y_train, X_test, Y_test))
print("Accuracy for FF Neural Net is : ", fnn(X_train, Y_train, X_test, Y_test))
# print("Accuracy for Recurrent Neural Net is : ", rnn(X_train, Y_train, X_test, Y_t))
# print('Random Forest:')
# tree_count = clf.test_random_forest(analysis, 50, 10, True)
# print('Optimal Tree Count: {}.'.format(tree_count))

results = []
for i in np.round(np.linspace(0, 0.9, 10), 1).tolist():
    print('Noise Amount: {}'.format(i))

    # Preprocess
    analysis = pr.Analyser(blur, oasis, i)
    # analysis.get_summaries()
    analysis.train_test()
    analysis.get_tfidf()

    # Fit logistic regression
    print('Logistic Regression:')
    logit_acc, logit_prec, logit_rec = clf.logistic_regression(analysis, 10)
    print('Accuracy: {}% +/- {}\nPrecision: {}% +/- {}\nRecall: {}% +/- {}'.format(
        np.round(logit_acc[0] * 100, 2), np.round(100 * logit_acc[1], 2),
        np.round(100 * logit_prec[0], 2), np.round(100 * logit_prec[1], 2),
        np.round(100 * logit_rec[0], 2), np.round(100 * logit_rec[1], 2)))
    result = ['logistic', i]
    result.extend(logit_acc + logit_prec + logit_rec)
    results.append(result)
    print('{}\n'.format('-' * 80))

    # Fit random forest
    print('Random Forest:')
    rf_acc, rf_prec, rf_rec = clf.random_forest(analysis, 100, 10)
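# clf.logistic_regression is not shown. A plausible sketch, assuming it runs
# k-fold cross-validation on the analysis object's tf-idf features and returns
# (mean, std) tuples for accuracy, precision and recall, which matches the
# logit_acc + logit_prec + logit_rec concatenation above (attribute names such
# as analysis.tfidf and analysis.labels are assumptions):
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate


def logistic_regression(analysis, folds):
    scores = cross_validate(LogisticRegression(max_iter=1000),
                            analysis.tfidf, analysis.labels, cv=folds,
                            scoring=('accuracy', 'precision', 'recall'))
    acc = (scores['test_accuracy'].mean(), scores['test_accuracy'].std())
    prec = (scores['test_precision'].mean(), scores['test_precision'].std())
    rec = (scores['test_recall'].mean(), scores['test_recall'].std())
    return acc, prec, rec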