def main():
    """Tune, train and score the SVM spam classifier.

    Reads the train/validation/test TSV splits, builds the dictionary from
    the training messages, converts every split with ``transform_text``,
    asks ``compute_best_svm_radius`` to pick a radius from
    ``[0.01, 0.1, 1, 10]`` using the training and validation splits, then
    trains with that radius and predicts on the test split.

    Side effects: prints the dictionary size, the chosen radius and the test
    accuracy, and writes the test predictions to ``svm_predictions.txt``.
    """
    # Load the three splits in train -> val -> test order.
    splits = [
        util.load_spam_dataset(path)
        for path in ('spam_train.tsv', 'spam_val.tsv', 'spam_test.tsv')
    ]
    (train_text, train_y), (val_text, val_y), (test_text, test_y) = splits

    # The dictionary is built from the training messages only.
    vocab = create_dictionary(train_text)
    print('Size of dictionary: ', len(vocab))

    x_train, x_val, x_test = (
        transform_text(text, vocab)
        for text in (train_text, val_text, test_text)
    )

    candidate_radii = [0.01, 0.1, 1, 10]
    best_radius = compute_best_svm_radius(
        x_train, train_y, x_val, val_y, candidate_radii)
    print('The optimal SVM radius was {}'.format(best_radius))

    predicted = svm.train_and_predict_svm(x_train, train_y, x_test, best_radius)
    np.savetxt('svm_predictions.txt', predicted)

    # Fraction of test examples whose prediction equals the label.
    accuracy = np.mean(predicted == test_y)
    report = 'The SVM model had an accuracy of {} on the testing set'.format(
        accuracy)
    print(report)
# main: Naive Bayes spam-classification driver.
#
# Steps performed by the statements below, in order:
#   1. Load the train/val/test splits from '../data/ds6_{train,val,test}.tsv'
#      via util.load_spam_dataset.
#   2. Build the dictionary from the training messages and write it to
#      './output/p06_dictionary' via util.write_json.
#   3. Convert each split with transform_text; the first 100 rows of the
#      training matrix are saved to './output/p06_sample_train_matrix'.
#   4. Fit the Naive Bayes model on the training matrix/labels, predict on the
#      test matrix, save the predictions to
#      './output/p06_naive_bayes_predictions' and print the test accuracy
#      (mean of prediction == label).
#   5. Compute the top five indicative words, print them and write them to
#      './output/p06_top_indicative_words'.
#
# The validation split is loaded and transformed but not used by any visible
# statement of this function.
#
# NOTE(review): the definition ends with an opening triple quote whose closing
# delimiter is not visible here -- presumably the start of a string used to
# disable trailing code; confirm against the full file before touching it.
def main(): train_messages, train_labels = util.load_spam_dataset( '../data/ds6_train.tsv') val_messages, val_labels = util.load_spam_dataset('../data/ds6_val.tsv') test_messages, test_labels = util.load_spam_dataset('../data/ds6_test.tsv') dictionary = create_dictionary(train_messages) util.write_json('./output/p06_dictionary', dictionary) train_matrix = transform_text(train_messages, dictionary) np.savetxt('./output/p06_sample_train_matrix', train_matrix[:100, :]) val_matrix = transform_text(val_messages, dictionary) test_matrix = transform_text(test_messages, dictionary) naive_bayes_model = fit_naive_bayes_model(train_matrix, train_labels) naive_bayes_predictions = predict_from_naive_bayes_model( naive_bayes_model, test_matrix) np.savetxt('./output/p06_naive_bayes_predictions', naive_bayes_predictions) naive_bayes_accuracy = np.mean(naive_bayes_predictions == test_labels) print('Naive Bayes had an accuracy of {} on the testing set'.format( naive_bayes_accuracy)) top_5_words = get_top_five_naive_bayes_words(naive_bayes_model, dictionary) print('The top 5 indicative words for Naive Bayes are: ', top_5_words) util.write_json('./output/p06_top_indicative_words', top_5_words) """
def main():
    """Run the full spam pipeline: dictionary, Naive Bayes, then SVM.

    Side effects:
        * Prints the dictionary size, the Naive Bayes test accuracy, the top
          five indicative words, the chosen SVM radius and the SVM test
          accuracy.
        * Writes ``spam_dictionary``, ``spam_top_indicative_words`` and
          ``spam_optimal_radius`` through ``util.write_json``.
        * Writes ``spam_sample_train_matrix`` (first 100 rows of the training
          matrix) and ``spam_naive_bayes_predictions`` through ``np.savetxt``.
    """
    train_messages, train_labels = util.load_spam_dataset('spam_train.tsv')
    val_messages, val_labels = util.load_spam_dataset('spam_val.tsv')
    test_messages, test_labels = util.load_spam_dataset('spam_test.tsv')

    ### Q3.1 ###
    # The dictionary is built from the training messages only and then used
    # to convert all three splits.
    dictionary = create_dictionary(train_messages)
    print('Size of dictionary: ', len(dictionary))
    util.write_json('spam_dictionary', dictionary)

    train_matrix = transform_text(train_messages, dictionary)
    np.savetxt('spam_sample_train_matrix', train_matrix[:100, :])

    val_matrix = transform_text(val_messages, dictionary)
    test_matrix = transform_text(test_messages, dictionary)

    ### Q3.2 ###
    naive_bayes_model = fit_naive_bayes_model(train_matrix, train_labels)
    naive_bayes_predictions = predict_from_naive_bayes_model(
        naive_bayes_model, test_matrix)
    np.savetxt('spam_naive_bayes_predictions', naive_bayes_predictions)

    # Fraction of test examples whose prediction equals the label.
    naive_bayes_accuracy = np.mean(naive_bayes_predictions == test_labels)
    print('Naive Bayes had an accuracy of {} on the testing set'.format(
        naive_bayes_accuracy))

    ### Q3.3 ###
    top_5_words = get_top_five_naive_bayes_words(naive_bayes_model, dictionary)
    print('The top 5 indicative words for Naive Bayes are: ', top_5_words)
    util.write_json('spam_top_indicative_words', top_5_words)

    ### Q3.4 ###
    # The radius is chosen using the training and validation splits.
    optimal_radius = compute_best_svm_radius(
        train_matrix, train_labels, val_matrix, val_labels, [0.01, 0.1, 1, 10])
    util.write_json('spam_optimal_radius', optimal_radius)
    print('The optimal SVM radius was {}'.format(optimal_radius))

    svm_predictions = svm.train_and_predict_svm(
        train_matrix, train_labels, test_matrix, optimal_radius)
    svm_accuracy = np.mean(svm_predictions == test_labels)
    # The template has a single placeholder, so only the accuracy is passed
    # (a second, silently ignored argument used to be supplied here).
    print('The SVM model had an accuracy of {} on the testing set'.format(
        svm_accuracy))
def experimenting_without_punctuation():
    """Re-run the Naive Bayes and SVM pipeline under an experiment banner.

    Prints the banner, then performs the dictionary / Naive Bayes / SVM steps
    and prints the dictionary size, both test accuracies, the top five
    indicative words and the chosen SVM radius.

    Side effects: writes ``spam_sample_train_matrix`` (first 100 rows of the
    training matrix) and ``spam_naive_bayes_predictions`` with ``np.savetxt``.

    NOTE(review): nothing in this function removes punctuation itself;
    presumably ``create_dictionary`` / ``transform_text`` (or the data files)
    are altered for the experiment -- confirm against the caller.
    """
    train_messages, train_labels = util.load_spam_dataset('spam_train.tsv')
    val_messages, val_labels = util.load_spam_dataset('spam_val.tsv')
    test_messages, test_labels = util.load_spam_dataset('spam_test.tsv')

    print(
        'EXPERIMENT: WHAT HAPPENS WHEN WE DELETE PUNCTUATION FROM OUR MESSAGES'
    )

    ### Q3.1 ###
    dictionary = create_dictionary(train_messages)
    print('Size of dictionary: ', len(dictionary))

    train_matrix = transform_text(train_messages, dictionary)
    np.savetxt('spam_sample_train_matrix', train_matrix[:100, :])

    val_matrix = transform_text(val_messages, dictionary)
    test_matrix = transform_text(test_messages, dictionary)

    ### Q3.2 ###
    naive_bayes_model = fit_naive_bayes_model(train_matrix, train_labels)
    naive_bayes_predictions = predict_from_naive_bayes_model(
        naive_bayes_model, test_matrix)
    np.savetxt('spam_naive_bayes_predictions', naive_bayes_predictions)

    # Fraction of test examples whose prediction equals the label.
    naive_bayes_accuracy = np.mean(naive_bayes_predictions == test_labels)
    print('Naive Bayes had an accuracy of {} on the testing set'.format(
        naive_bayes_accuracy))

    top_5_words = get_top_five_naive_bayes_words(naive_bayes_model, dictionary)
    print('The top 5 indicative words for Naive Bayes are: ', top_5_words)

    ### Q3.4 ###
    optimal_radius = compute_best_svm_radius(
        train_matrix, train_labels, val_matrix, val_labels, [0.01, 0.1, 1, 10])
    print('The optimal SVM radius was {}'.format(optimal_radius))

    svm_predictions = svm.train_and_predict_svm(
        train_matrix, train_labels, test_matrix, optimal_radius)
    svm_accuracy = np.mean(svm_predictions == test_labels)
    # The template has a single placeholder, so only the accuracy is passed
    # (a second, silently ignored argument used to be supplied here).
    print('The SVM model had an accuracy of {} on the testing set'.format(
        svm_accuracy))
def main():
    """Run the p06 spam pipeline (Naive Bayes, then SVM) and time it.

    Side effects:
        * Writes ``./output/p06_dictionary``,
          ``./output/p06_top_indicative_words`` and
          ``./output/p06_optimal_radius`` through ``util.write_json``.
        * Writes ``./output/p06_sample_train_matrix`` (first 100 rows of the
          training matrix) and ``./output/p06_naive_bayes_predictions``
          through ``np.savetxt``.
        * Prints both test accuracies, the top five indicative words, the
          chosen SVM radius and the elapsed wall-clock time of this call.
    """
    # Take the start timestamp here so the final timing report does not rely
    # on a ``start`` name being defined somewhere outside this function.
    start = time.time()

    train_messages, train_labels = util.load_spam_dataset(
        '../data/ds6_train.tsv')
    val_messages, val_labels = util.load_spam_dataset('../data/ds6_val.tsv')
    test_messages, test_labels = util.load_spam_dataset('../data/ds6_test.tsv')

    # The dictionary is built from the training messages only.
    dictionary = create_dictionary(train_messages)
    util.write_json('./output/p06_dictionary', dictionary)

    train_matrix = transform_text(train_messages, dictionary)
    np.savetxt('./output/p06_sample_train_matrix', train_matrix[:100, :])

    val_matrix = transform_text(val_messages, dictionary)
    test_matrix = transform_text(test_messages, dictionary)

    naive_bayes_model = fit_naive_bayes_model(train_matrix, train_labels)
    naive_bayes_predictions = predict_from_naive_bayes_model(
        naive_bayes_model, test_matrix)
    np.savetxt('./output/p06_naive_bayes_predictions',
               naive_bayes_predictions)

    # Fraction of test examples whose prediction equals the label.
    naive_bayes_accuracy = np.mean(naive_bayes_predictions == test_labels)
    print('Naive Bayes had an accuracy of {} on the testing set'.format(
        naive_bayes_accuracy))

    top_5_words = get_top_five_naive_bayes_words(naive_bayes_model, dictionary)
    print('The top 5 indicative words for Naive Bayes are: ', top_5_words)
    util.write_json('./output/p06_top_indicative_words', top_5_words)

    # The radius is chosen using the training and validation splits.
    optimal_radius = compute_best_svm_radius(
        train_matrix, train_labels, val_matrix, val_labels, [0.01, 0.1, 1, 10])
    util.write_json('./output/p06_optimal_radius', optimal_radius)
    print('The optimal SVM radius was {}'.format(optimal_radius))

    svm_predictions = svm.train_and_predict_svm(
        train_matrix, train_labels, test_matrix, optimal_radius)
    svm_accuracy = np.mean(svm_predictions == test_labels)
    # The template has a single placeholder, so only the accuracy is passed
    # (a second, silently ignored argument used to be supplied here).
    print('The SVM model had an accuracy of {} on the testing set'.format(
        svm_accuracy))

    end = time.time()
    print("Execution Time: ", end - start)
def main():
    """Fit a ``GaussianNB`` model on the spam data and report test accuracy.

    Loads the train and test splits, builds the dictionary from the training
    messages, converts both splits with ``transform_text``, fits
    ``GaussianNB`` on the training matrix and predicts on the test matrix.

    Side effects: prints the dictionary size and the test accuracy, and
    writes the predictions to
    ``vscikit-learn_spam_naive_bayes_predictions``.
    """
    train_text, train_y = util.load_spam_dataset('spam_train.tsv')
    test_text, test_y = util.load_spam_dataset('spam_test.tsv')

    # The dictionary is built from the training messages only.
    vocab = create_dictionary(train_text)
    print('Size of dictionary: ', len(vocab))

    x_train, x_test = (
        transform_text(text, vocab) for text in (train_text, test_text)
    )

    # Predict on whatever object ``fit`` returns, exactly as a chained call.
    fitted = GaussianNB().fit(x_train, train_y)
    predicted = fitted.predict(x_test)
    np.savetxt('vscikit-learn_spam_naive_bayes_predictions', predicted)

    # Fraction of test examples whose prediction equals the label.
    accuracy = np.mean(predicted == test_y)
    report = 'Naive Bayes had an accuracy of {} on the testing set'.format(
        accuracy)
    print(report)
def main():
    """Train the Naive Bayes spam model and report its results.

    Loads the train and test splits, builds the dictionary from the training
    messages, converts both splits with ``transform_text``, fits the model
    with ``fit_naive_bayes_model`` and predicts on the test matrix.

    Side effects: prints the dictionary size, the test accuracy and the top
    five indicative words, and writes the predictions to
    ``spam_naive_bayes_predictions``.
    """
    train_text, train_y = util.load_spam_dataset('spam_train.tsv')
    test_text, test_y = util.load_spam_dataset('spam_test.tsv')

    # The dictionary is built from the training messages only.
    vocab = create_dictionary(train_text)
    print('Size of dictionary: ', len(vocab))

    x_train, x_test = (
        transform_text(text, vocab) for text in (train_text, test_text)
    )

    model = fit_naive_bayes_model(x_train, train_y)
    predicted = predict_from_naive_bayes_model(model, x_test)
    np.savetxt('spam_naive_bayes_predictions', predicted)

    # Fraction of test examples whose prediction equals the label.
    accuracy = np.mean(predicted == test_y)
    report = 'Naive Bayes had an accuracy of {} on the testing set'.format(
        accuracy)
    print(report)

    indicative_words = get_top_five_naive_bayes_words(model, vocab)
    print('The top 5 indicative words for Naive Bayes are: ', indicative_words)
def main():
    """Fit an ``svm.SVC`` classifier on the spam data and report accuracy.

    Loads the train and test splits, builds the dictionary from the training
    messages, converts both splits with ``transform_text``, fits a
    default-constructed ``svm.SVC`` and predicts on the test matrix.

    Side effects: prints the dictionary size and the test accuracy, and
    writes the predictions to ``vscikit-learn_svm_predictions.txt``.
    """
    train_text, train_y = util.load_spam_dataset('spam_train.tsv')
    test_text, test_y = util.load_spam_dataset('spam_test.tsv')

    # The dictionary is built from the training messages only.
    vocab = create_dictionary(train_text)
    print('Size of dictionary: ', len(vocab))

    x_train, x_test = (
        transform_text(text, vocab) for text in (train_text, test_text)
    )

    classifier = svm.SVC()
    classifier.fit(x_train, train_y)
    predicted = classifier.predict(x_test)
    np.savetxt('vscikit-learn_svm_predictions.txt', predicted)

    # Fraction of test examples whose prediction equals the label.
    accuracy = np.mean(predicted == test_y)
    report = 'The SVM model had an accuracy of {} on the testing set'.format(
        accuracy)
    print(report)
def main(train_path, test_path):
    """Train and evaluate the ``NaiveBayes`` spam classifier.

    Args:
        train_path: Passed to ``util.load_spam_dataset`` to load the
            training messages and labels.
        test_path: Passed to ``util.load_spam_dataset`` to load the test
            messages and labels.

    Side effects: writes ``./output/p06_dictionary`` via ``util.write_json``,
    writes ``./output/p06_sample_train_matrix`` (first 100 rows of the
    training matrix) and ``./output/p06_naive_bayes_predictions`` via
    ``np.savetxt``, and prints the test accuracy and the top five words.
    """
    train_text, y_train = util.load_spam_dataset(train_path)
    test_text, y_test = util.load_spam_dataset(test_path)

    # The dictionary is built from the training messages only.
    vocab = create_dictionary(train_text)
    util.write_json('./output/p06_dictionary', vocab)

    features_train, features_test = (
        transform_text(text, vocab) for text in (train_text, test_text)
    )
    np.savetxt('./output/p06_sample_train_matrix', features_train[:100, :])

    classifier = NaiveBayes()
    classifier.fit(features_train, y_train)
    predicted = classifier.predict(features_test)
    np.savetxt('./output/p06_naive_bayes_predictions', predicted)

    # Fraction of test examples whose prediction equals the label.
    score = np.mean(predicted == y_test)
    report = 'Naive Bayes had an accuracy of {} on the testing set'.format(
        score)
    print(report)

    indicative_words = classifier.top_words(5, vocab)
    print('The top 5 indicative words for Naive Bayes are: ', indicative_words)
def main():
    """Run the spam pipeline: Naive Bayes, SVM, then logistic regression.

    The first part builds the dictionary from the training messages, converts
    all three splits with ``transform_text`` and evaluates Naive Bayes and the
    SVM. The second part loads the BERT encodings of the same splits, picks a
    learning rate with ``compute_best_logreg_learning_rate`` and evaluates
    logistic regression on them.

    Side effects:
        * Writes ``spam_dictionary``, ``spam_top_indicative_words`` and
          ``spam_optimal_radius`` through ``util.write_json``.
        * Writes ``spam_sample_train_matrix`` (first 100 rows of the training
          matrix) and ``spam_naive_bayes_predictions`` through ``np.savetxt``.
        * Prints the dictionary size, the three test accuracies, the top five
          indicative words, the chosen SVM radius and the chosen learning
          rate.
    """
    train_messages, train_labels = util.load_spam_dataset('spam_train.tsv')
    val_messages, val_labels = util.load_spam_dataset('spam_val.tsv')
    test_messages, test_labels = util.load_spam_dataset('spam_test.tsv')

    # The dictionary is built from the training messages only.
    dictionary = create_dictionary(train_messages)
    print('Size of dictionary: ', len(dictionary))
    util.write_json('spam_dictionary', dictionary)

    train_matrix = transform_text(train_messages, dictionary)
    np.savetxt('spam_sample_train_matrix', train_matrix[:100, :])

    val_matrix = transform_text(val_messages, dictionary)
    test_matrix = transform_text(test_messages, dictionary)

    naive_bayes_model = fit_naive_bayes_model(train_matrix, train_labels)
    naive_bayes_predictions = predict_from_naive_bayes_model(
        naive_bayes_model, test_matrix)
    np.savetxt('spam_naive_bayes_predictions', naive_bayes_predictions)

    # Fraction of test examples whose prediction equals the label.
    naive_bayes_accuracy = np.mean(naive_bayes_predictions == test_labels)
    print('Naive Bayes had an accuracy of {} on the testing set'.format(
        naive_bayes_accuracy))

    top_5_words = get_top_five_naive_bayes_words(naive_bayes_model, dictionary)
    print('The top 5 indicative words for Naive Bayes are: ', top_5_words)
    util.write_json('spam_top_indicative_words', top_5_words)

    # The radius is chosen using the training and validation splits.
    optimal_radius = compute_best_svm_radius(
        train_matrix, train_labels, val_matrix, val_labels, [0.01, 0.1, 1, 10])
    util.write_json('spam_optimal_radius', optimal_radius)
    print('The optimal SVM radius was {}'.format(optimal_radius))

    svm_predictions = svm.train_and_predict_svm(
        train_matrix, train_labels, test_matrix, optimal_radius)
    svm_accuracy = np.mean(svm_predictions == test_labels)
    # The template has a single placeholder, so only the accuracy is passed
    # (a second, silently ignored argument used to be supplied here).
    print('The SVM model had an accuracy of {} on the testing set'.format(
        svm_accuracy))

    # From here on the matrix names are rebound to the BERT encodings; the
    # label arrays loaded above are reused unchanged.
    train_matrix = util.load_bert_encoding('bert_train_matrix.tsv.bz2')
    val_matrix = util.load_bert_encoding('bert_val_matrix.tsv.bz2')
    test_matrix = util.load_bert_encoding('bert_test_matrix.tsv.bz2')

    best_learning_rate = compute_best_logreg_learning_rate(
        train_matrix, train_labels, val_matrix, val_labels,
        [0.01, 0.001, 0.0001, 0.00001, 0.000001])
    print('The best learning rate for logistic regression is {}'.format(
        best_learning_rate))

    logreg_predictions = logreg.train_and_predict_logreg(
        train_matrix, train_labels, test_matrix, best_learning_rate)
    logreg_accuracy = np.mean(logreg_predictions == test_labels)
    print('The Logistic Regression model with BERT encodings had an accuracy of {} on the testing set'.format(logreg_accuracy))
# main: loads the train/val/test spam splits from 'spam_train.tsv',
# 'spam_val.tsv' and 'spam_test.tsv' via util.load_spam_dataset, then builds
# the dictionary from the training messages with create_dictionary.
#
# NOTE(review): the visible definition stops right after the dictionary is
# built; it may continue beyond the visible source -- confirm against the
# full file before relying on this summary.
def main(): train_messages, train_labels = util.load_spam_dataset('spam_train.tsv') val_messages, val_labels = util.load_spam_dataset('spam_val.tsv') test_messages, test_labels = util.load_spam_dataset('spam_test.tsv') dictionary = create_dictionary(train_messages)