def test_classification_test_dataset(self):
    """Fit Pegasos on the training split, score it on the test split, list words.

    Trains ``p1.pegasos`` with T=25 and L=0.01 on ``train_bow_features`` /
    ``train_labels``, classifies ``test_bow_features``, prints the accuracy
    against ``test_labels``, and then prints the first ten and last ten
    entries of the ranking returned by ``utils.most_explanatory_word``.

    Returns:
        None. All results are reported through ``print``.
    """
    # Hyperparameters selected from the validation accuracies.
    num_epochs = 25
    reg_lambda = 0.01

    theta, theta_0 = p1.pegasos(
        feature_matrix=train_bow_features,
        labels=train_labels,
        T=num_epochs,
        L=reg_lambda,
    )

    # Score the fitted classifier on the held-out test data.
    accuracy = p1.accuracy(
        p1.classify(test_bow_features, theta, theta_0),
        test_labels,
    )
    print(f'Accuracy on test data : {accuracy}')

    # Only the weight vector (not the bias) is used to rank the words.
    best_theta = theta

    # Order the vocabulary by feature index so position k lines up with
    # weight k; ties on the index fall back to the word itself.
    by_index = sorted(dictionary.items(), key=lambda entry: (entry[1], entry[0]))
    wordlist = [word for word, _ in by_index]

    sorted_word_features = utils.most_explanatory_word(best_theta, wordlist)
    print("Most Explanatory Word Features")
    print(sorted_word_features[:10])
    print("Least Explanatory Word Features")
    print(sorted_word_features[-10:])
for i, text in enumerate(reviews): word_list = extract_words(text) for word in word_list: if word in dictionary: feature_matrix[i, dictionary[word]] = word_list.count(word) return feature_matrix #pragma: coderesponse end train_data = utils.load_data('reviews_train.tsv') train_texts, train_labels = zip(*((sample['text'], sample['sentiment']) for sample in train_data)) dictionary = bag_of_words(train_texts) train_bow_features = extract_bow_feature_vectors(train_texts, dictionary) theta, theta0 = pegasos(train_bow_features, train_labels, T=25, L=0.01) wordlist = [word for (idx, word) in sorted(zip(dictionary.values(), dictionary.keys()))] sorted_word_features = utils.most_explanatory_word(theta, wordlist) print(" *** Most Positive Word Features ***") print(sorted_word_features[0:20]) print(" *** Most Negative Word Features ***") print(sorted_word_features[-20:-1]) x = np.repeat([-1, 1], 10) y = np.tile(np.arange(1,11), 2) fig, ax = plt.subplots() wlist = sorted_word_features[-11:-1] + sorted_word_features[0:10] colors = ['g' if label == 1 else 'r' for label in x] for i, word in enumerate(wlist): ax.scatter(x[i], y[i] , s=700*(len(word)) ,c =colors[i], \ marker=r"$ {} $".format(word), edgecolors='none') ax.spines['left'].set_position('zero') ax.spines['right'].set_color('none')
#-------------------------------------------------------------------------------
# Evaluate Pegasos with the chosen hyperparameters, then rank the vocabulary
# by the learned weights.
T = 25
L = 0.0100

# NOTE(review): the second score is computed on test_bow_features/test_labels
# but is stored and printed as "validation" accuracy -- confirm which split
# is intended before relying on the label.
avg_peg_train_accuracy, avg_peg_val_accuracy = p1.classifier_accuracy(
    p1.pegasos,
    train_bow_features,
    test_bow_features,
    train_labels,
    test_labels,
    T=T,
    L=L,
)
print("{:50} {:.4f}".format("Training accuracy for Pegasos:", avg_peg_train_accuracy))
print("{:50} {:.4f}".format("Validation accuracy for Pegasos:", avg_peg_val_accuracy))

# Refit on the training data to obtain the parameters themselves; the result
# is indexed below, with element 0 taken as the weight vector.
thetas_pegasos = p1.pegasos(train_bow_features, train_labels, T, L)

#-------------------------------------------------------------------------------
# Assign to best_theta, the weights (and not the bias!) learned by your most
# accurate algorithm with the optimal choice of hyperparameters.
#-------------------------------------------------------------------------------
best_theta = thetas_pegasos[0]

# Order the vocabulary by feature index so that wordlist[k] matches
# best_theta[k].
wordlist = [word for (idx, word) in sorted(zip(dictionary.values(), dictionary.keys()))]
# Pass the index-ordered word list directly. Rebuilding a dictionary from it
# with p1.bag_of_words could reorder or drop entries and misalign the words
# with their weights.
sorted_word_features = utils.most_explanatory_word(best_theta, wordlist)
print("Most Explanatory Word Features")
print(sorted_word_features[:10])
#-------------------------------------------------------------------------------
# Report the baseline accuracies. The single-argument parenthesised form of
# print produces the same output under Python 2 and Python 3, and matches the
# print(...) calls further down in this script.
print("(train accuracy, test accuracy) before modification")
print(p1.average_passive_aggressive_accuracy(train_bow_features, test_bow_features, train_labels, test_labels, T, L))

#-------------------------------------------------------------------------------
#
#-------------------------------------------------------------------------------
#
# Assign to best_theta, the weights (and not the bias!) learned by the most
# accurate algorithm with the optimal choice of hyperparameters.
#-------------------------------------------------------------------------------
# Fit on the training split: the weights being inspected should come from the
# same data the evaluated classifier was trained on, not from the test set.
# NOTE(review): best_T / best_L are defined outside this view -- confirm they
# hold the tuned hyperparameters.
best_theta = p1.average_passive_aggressive(train_bow_features, train_labels, best_T, best_L)[0]

# Order the vocabulary by feature index so that wordlist[k] matches
# best_theta[k].
wordlist = [word for (idx, word) in sorted(zip(dictionary.values(), dictionary.keys()))]
sorted_word_features = utils.most_explanatory_word(best_theta, wordlist)
print("Most Explanatory Word Features")
print(sorted_word_features[:10])

#-------------------------------------------------------------------------------
#
#-------------------------------------------------------------------------------
#
# Assessing performance on the validation set.
#
#-------------------------------------------------------------------------------
# Build the modified dictionary from the training texts only, then extract
# features for both splits with that same dictionary.
dictionary_mod = p1.modified_bag_of_words(train_texts)
train_final_features = p1.extract_final_features(train_texts, dictionary_mod)
val_final_features = p1.extract_final_features(val_texts, dictionary_mod)
def problem9b(T=25, L=0.01):
    """Print the first ten words of the Pegasos weight ranking.

    Fits ``p1.pegasos`` on ``train_bow_features`` / ``train_labels`` and
    keeps element 0 of its result as the weight vector. The vocabulary is
    ordered by feature index and handed to ``utils.most_explanatory_word``;
    the first ten entries of the returned ranking are printed.

    Args:
        T: Forwarded to ``p1.pegasos`` as its third positional argument.
        L: Forwarded to ``p1.pegasos`` as its fourth positional argument.

    Returns:
        None. The ranking is reported through ``print``.
    """
    fitted = p1.pegasos(train_bow_features, train_labels, T, L)
    best_theta = fitted[0]

    # Sort (index, word) pairs so that position k of the list matches weight k.
    index_word_pairs = sorted(zip(dictionary.values(), dictionary.keys()))
    wordlist = [pair[1] for pair in index_word_pairs]

    ranking = utils.most_explanatory_word(best_theta, wordlist)
    print("Most Explanatory Word Features")
    print(ranking[:10])