def run_permutation_test_bow_lowercase(train, test, kernel):
    """
    Run a permutation test of the difference between two BOW svms: one built
    with get_bow_vectors' default lowercase setting and one built with
    lowercase=False.
    """
    train_Y = train['sentiment'].to_numpy()
    test_Y = test['sentiment'].to_numpy()
    train_reviews = train['review'].values
    test_reviews = test['review'].values

    # features using the default lowercase setting
    default_train_X, default_vectorizer = get_bow_vectors(train_reviews,
                                                          min_count=4,
                                                          max_frac=0.5,
                                                          frequency=False)
    default_test_X, _ = get_bow_vectors(test_reviews,
                                        vectorizer=default_vectorizer)

    # same settings, but with lowercase=False
    no_lower_train_X, no_lower_vectorizer = get_bow_vectors(train_reviews,
                                                            min_count=4,
                                                            max_frac=0.5,
                                                            lowercase=False,
                                                            frequency=False)
    no_lower_test_X, _ = get_bow_vectors(test_reviews,
                                         vectorizer=no_lower_vectorizer)

    default_svm = build_svm_classifier(default_train_X,
                                       train_Y,
                                       kernel=kernel,
                                       gamma='scale')
    no_lower_svm = build_svm_classifier(no_lower_train_X,
                                        train_Y,
                                        kernel=kernel,
                                        gamma='scale')

    default_predictions = default_svm.predict(default_test_X)
    no_lower_predictions = no_lower_svm.predict(no_lower_test_X)
    return permutation_test(test_Y, default_predictions, no_lower_predictions)
def run_permutation_test_bow_vs_doc2vec(train, test, bow_kernel,
                                        doc2vec_kernel, doc2vec_train_X,
                                        doc2vec_train_Y, doc2vec_model):
    """
    Compare a BOW svm against a doc2vec svm with a permutation test.

    The BOW svm is trained on `train`; the doc2vec svm is trained on the
    supplied doc2vec_train_X / doc2vec_train_Y. Both are evaluated on the
    reviews in `test`.
    """
    # labels and features for both svms
    train_Y = train['sentiment'].to_numpy()
    test_Y = test['sentiment'].to_numpy()
    bow_train_X, bow_vectorizer = get_bow_vectors(train['review'].values,
                                                  min_count=4,
                                                  max_frac=0.5,
                                                  frequency=False)
    bow_test_X, _ = get_bow_vectors(test['review'].values,
                                    vectorizer=bow_vectorizer)
    doc2vec_test_X = get_doc2vec_data(test['review'].values, doc2vec_model)

    # fit one classifier per representation
    bow_svm = build_svm_classifier(bow_train_X,
                                   train_Y,
                                   kernel=bow_kernel,
                                   gamma='scale')
    doc2vec_svm = build_svm_classifier(doc2vec_train_X,
                                       doc2vec_train_Y,
                                       kernel=doc2vec_kernel,
                                       gamma='scale')

    # the BOW training matrix is not needed for prediction; drop the reference
    del bow_train_X

    bow_predictions = bow_svm.predict(bow_test_X)
    doc2vec_predictions = doc2vec_svm.predict(doc2vec_test_X)
    return permutation_test(test_Y, bow_predictions, doc2vec_predictions)
def run_permutation_test_bow_with_uni_and_bigrams(train, test, kernel):
    """
    Run permutation test of difference between using only unigrams and using
    unigram and bigram features concatenated column-wise.
    """
    train_Y = train['sentiment'].to_numpy()
    test_Y = test['sentiment'].to_numpy()
    train_reviews = train['review'].values
    test_reviews = test['review'].values

    # bigram features (bigrams=True, min_count=7)
    bi_train_X, bi_vectorizer = get_bow_vectors(train_reviews,
                                                min_count=7,
                                                max_frac=0.5,
                                                frequency=False,
                                                bigrams=True)
    bi_test_X, _ = get_bow_vectors(test_reviews, vectorizer=bi_vectorizer)

    # unigram features (bigrams=False, min_count=4)
    uni_train_X, uni_vectorizer = get_bow_vectors(train_reviews,
                                                  min_count=4,
                                                  max_frac=0.5,
                                                  frequency=False,
                                                  bigrams=False)
    uni_test_X, _ = get_bow_vectors(test_reviews, vectorizer=uni_vectorizer)

    uni_svm = build_svm_classifier(uni_train_X,
                                   train_Y,
                                   kernel=kernel,
                                   gamma='scale')
    combined_train_X = np.concatenate((uni_train_X, bi_train_X), axis=1)
    combined_svm = build_svm_classifier(combined_train_X,
                                        train_Y,
                                        kernel=kernel,
                                        gamma='scale')

    uni_predictions = uni_svm.predict(uni_test_X)
    combined_test_X = np.concatenate((uni_test_X, bi_test_X), axis=1)
    combined_predictions = combined_svm.predict(combined_test_X)
    return permutation_test(test_Y, uni_predictions, combined_predictions)
def run_permutation_test_bow_with_bigrams(train, test, kernel):
    """
    Run permutation test of difference between using only bigrams and only
    unigrams.
    """
    train_Y = train['sentiment'].to_numpy()
    test_Y = test['sentiment'].to_numpy()
    train_reviews = train['review'].values
    test_reviews = test['review'].values

    # bigram features (bigrams=True, min_count=7)
    bigram_train_X, bigram_vectorizer = get_bow_vectors(train_reviews,
                                                        min_count=7,
                                                        max_frac=0.5,
                                                        frequency=False,
                                                        bigrams=True)
    bigram_test_X, _ = get_bow_vectors(test_reviews,
                                       vectorizer=bigram_vectorizer)

    # unigram features (default bigrams setting, min_count=4)
    unigram_train_X, unigram_vectorizer = get_bow_vectors(train_reviews,
                                                          min_count=4,
                                                          max_frac=0.5,
                                                          frequency=False)
    unigram_test_X, _ = get_bow_vectors(test_reviews,
                                        vectorizer=unigram_vectorizer)

    bigram_svm = build_svm_classifier(bigram_train_X,
                                      train_Y,
                                      kernel=kernel,
                                      gamma='scale')
    unigram_svm = build_svm_classifier(unigram_train_X,
                                       train_Y,
                                       kernel=kernel,
                                       gamma='scale')

    bigram_predictions = bigram_svm.predict(bigram_test_X)
    unigram_predictions = unigram_svm.predict(unigram_test_X)
    return permutation_test(test_Y, bigram_predictions, unigram_predictions)
def run_permutation_test_bow_feature_presence_vs_frequency(
        train, test, kernel):
    """
    Run permutation test of difference between BOW vectors built with
    frequency=False and BOW vectors built with the default frequency setting.
    """
    train_Y = train['sentiment'].to_numpy()
    test_Y = test['sentiment'].to_numpy()
    train_reviews = train['review'].values
    test_reviews = test['review'].values

    # frequency=False variant
    pres_train_X, pres_vectorizer = get_bow_vectors(train_reviews,
                                                    min_count=4,
                                                    max_frac=0.5,
                                                    frequency=False)
    pres_test_X, _ = get_bow_vectors(test_reviews, vectorizer=pres_vectorizer)

    # default frequency setting
    freq_train_X, freq_vectorizer = get_bow_vectors(train_reviews,
                                                    min_count=4,
                                                    max_frac=0.5)
    freq_test_X, _ = get_bow_vectors(test_reviews, vectorizer=freq_vectorizer)

    pres_svm = build_svm_classifier(pres_train_X,
                                    train_Y,
                                    kernel=kernel,
                                    gamma='scale')
    freq_svm = build_svm_classifier(freq_train_X,
                                    train_Y,
                                    kernel=kernel,
                                    gamma='scale')

    pres_predictions = pres_svm.predict(pres_test_X)
    freq_predictions = freq_svm.predict(freq_test_X)
    return permutation_test(test_Y, pres_predictions, freq_predictions)
def get_bow_data(review_data,
                 train_frac=0.7,
                 min_count=10,
                 max_frac=0.5,
                 dim=100):
    """
    Get BOW vector training and test sets.

    review_data: table with 'review' and 'sentiment' columns.
    train_frac: fraction passed to get_train_test_split as the split size.
    min_count, max_frac: forwarded to get_bow_vectors.
    dim: unused; kept so existing callers that pass it keep working.

    Returns whatever get_train_test_split returns for (train_frac, X, Y).
    """
    # get_bow_vectors returns (vectors, vectorizer); only the vectors are
    # needed here, so the fitted vectorizer is discarded.
    X, _ = get_bow_vectors(review_data['review'].values,
                           min_count=min_count,
                           max_frac=max_frac)
    Y = review_data['sentiment'].to_numpy()
    print('Created a BOW vector of shape', X.shape)
    # honour the caller's train_frac instead of a hard-coded 0.7
    return get_train_test_split(train_frac, X, Y)
def run_permutation_test_two_bow_kernels(train, test, kernel1, kernel2):
    """
    Run permutation test of difference between two svm kernels trained on the
    same BOW vectors.
    """
    train_Y = train['sentiment'].to_numpy()
    test_Y = test['sentiment'].to_numpy()

    # one shared feature space for both classifiers
    train_X, fitted_vectorizer = get_bow_vectors(train['review'].values,
                                                 min_count=4,
                                                 max_frac=0.5,
                                                 frequency=False)
    test_X, _ = get_bow_vectors(test['review'].values,
                                vectorizer=fitted_vectorizer)

    first_svm = build_svm_classifier(train_X,
                                     train_Y,
                                     kernel=kernel1,
                                     gamma='scale')
    second_svm = build_svm_classifier(train_X,
                                      train_Y,
                                      kernel=kernel2,
                                      gamma='scale')

    first_predictions = first_svm.predict(test_X)
    second_predictions = second_svm.predict(test_X)
    return permutation_test(test_Y, first_predictions, second_predictions)
def deployment_test(imdb_data_folder, doc2vec_model, doc2vec_svm,
                    uni_vectorizer, bi_vectorizer, uni_bi_bow_svm):
    """
    Evaluate the trained Doc2Vec and BOW svms on the 'new' review subfolder.

    imdb_data_folder: root folder passed to get_reviews.
    doc2vec_model, doc2vec_svm: model used to build vectors for the new
        reviews and the svm evaluated on them.
    uni_vectorizer, bi_vectorizer: fitted vectorizers used to build the
        unigram and bigram BOW vectors for the new reviews.
    uni_bi_bow_svm: svm evaluated on the concatenated unigram+bigram vectors.

    Prints both accuracies and then, for every new review, the review text,
    both predictions and the correct label. Returns None.
    """
    print('Fetching new reviews')
    # NOTE: imdb_data_folder used to be overwritten with 'aclImdb' here, which
    # silently ignored the caller's argument.
    imdb_sentiments = ['pos', 'neg']

    # get new imdb reviews
    new_reviews = get_reviews(imdb_data_folder, imdb_sentiments, ['new'])
    new_test_Y = new_reviews['sentiment']

    doc2vec_test_new = get_doc2vec_data(new_reviews['review'].values,
                                        doc2vec_model)
    doc2vec_acc_new = estimate_svm_accuracy(doc2vec_test_new, new_test_Y,
                                            doc2vec_svm)
    print('Doc2Vec acc with new data', doc2vec_acc_new)

    new_test_X, _ = get_bow_vectors(new_reviews['review'].values,
                                    min_count=4,
                                    max_frac=0.5,
                                    frequency=False,
                                    vectorizer=uni_vectorizer)
    new_bi_test_X, _ = get_bow_vectors(new_reviews['review'].values,
                                       min_count=7,
                                       max_frac=0.5,
                                       frequency=False,
                                       vectorizer=bi_vectorizer)
    # column-wise concatenation: unigram columns first, then bigram columns
    new_conc_test_X = np.concatenate((new_test_X, new_bi_test_X), axis=1)
    bow_acc_new = estimate_svm_accuracy(new_conc_test_X, new_test_Y,
                                        uni_bi_bow_svm)
    print('BOW acc with new data', bow_acc_new)

    # per-review report; each row is reshaped to (1, -1) before predict
    for i, (review, label) in enumerate(
            zip(new_reviews['review'], new_reviews['sentiment'])):
        print('i:', i)
        print('review')
        print(review)
        print('bow prediction',
              uni_bi_bow_svm.predict(new_conc_test_X[i].reshape(1, -1)))
        print('doc2vec prediction',
              doc2vec_svm.predict(doc2vec_test_new[i].reshape(1, -1)))
        print('correct label', label)
        print('-------------')
def main():
    """
    Train the doc2vec and BOW models, run the cross-validated experiments and
    permutation tests, report blind-test accuracies, and finish with error
    analysis, visualisations and a deployment test.
    """
    np.random.seed(123)
    imdb_data_folder = 'aclImdb'
    imdb_sentiments = ['pos', 'neg']
    subfolders = ['train', 'test']

    # imdb reviews are used to train doc2vec
    imdb_reviews = get_reviews(imdb_data_folder, imdb_sentiments, subfolders)
    reviews, _ = get_uni_and_bi_grams('data-tagged')
    review_data = build_data(reviews)

    # hold out a blind set that is only used for reporting results
    development_data, blind_test_set = get_train_test_split(0.9, review_data)
    test_Y = blind_test_set['sentiment'].values

    # Optional hyperparameter search, currently disabled:
    # find_optimal_doc2vec_hyperparams(imdb_reviews, development_data)

    ####### TRAINING MODELS AND RUNNING EXPERIMENTS #############
    # dm=0 model
    doc2vec_train_X, doc2vec_train_Y, doc2vec_model = train_doc2vec_model(
        imdb_reviews,
        epochs=30,
        window_size=4,
        dm=0,
        dbow_words=1,
        pretrained=True,
        save=True)
    doc2vec_test_X = get_doc2vec_data(blind_test_set['review'].values,
                                      doc2vec_model)
    doc2vec_svm = build_svm_classifier(doc2vec_train_X,
                                       doc2vec_train_Y,
                                       kernel='rbf',
                                       gamma='scale')

    # dm=1 with dm_concat=1; the returned labels are not used
    concat_doc2vec_train_X, _, concat_doc2vec_model = train_doc2vec_model(
        imdb_reviews,
        epochs=30,
        window_size=4,
        dm=1,
        dm_concat=1,
        pretrained=True,
        save=True)
    # plain dm=1; the returned labels are not used
    dm_doc2vec_train_X, _, dm_doc2vec_model = train_doc2vec_model(
        imdb_reviews,
        epochs=30,
        window_size=4,
        dm=1,
        pretrained=True,
        save=True)
    print('-----------')

    # order matters: index 0 is dm=0, 1 is dm=1, 2 is dm=1 with dm_concat=1
    doc2vec_train_Xs = [
        doc2vec_train_X, dm_doc2vec_train_X, concat_doc2vec_train_X
    ]
    doc2vec_models = [doc2vec_model, dm_doc2vec_model, concat_doc2vec_model]
    get_cross_validated_baseline_accuracies(development_data,
                                            doc2vec_Xs=doc2vec_train_Xs,
                                            doc2vec_Y=doc2vec_train_Y,
                                            doc2vec_models=doc2vec_models)
    print('-----------')
    cross_validate_permutation_tests(development_data, imdb_reviews,
                                     doc2vec_train_Xs, doc2vec_train_Y,
                                     doc2vec_models)

    ###################### USING THE BEST MODELS #################
    print('---------Accuracies using the best models---------')
    doc2vec_acc = estimate_svm_accuracy(doc2vec_test_X, test_Y, doc2vec_svm)
    print('doc2vec accuracy', doc2vec_acc)

    # fit the unigram and bigram vectorizers on the development data
    uni_train_X, uni_vectorizer = get_bow_vectors(
        development_data['review'].values,
        min_count=4,
        max_frac=0.5,
        frequency=False)
    bi_train_X, bi_vectorizer = get_bow_vectors(
        development_data['review'].values,
        min_count=7,
        max_frac=0.5,
        frequency=False,
        bigrams=True)
    # build the blind-test vectors with the fitted vectorizers
    uni_test_X, _ = get_bow_vectors(blind_test_set['review'].values,
                                    min_count=4,
                                    max_frac=0.5,
                                    frequency=False,
                                    vectorizer=uni_vectorizer)
    bi_test_X, _ = get_bow_vectors(blind_test_set['review'].values,
                                   min_count=7,
                                   max_frac=0.5,
                                   frequency=False,
                                   vectorizer=bi_vectorizer)
    uni_bi_test_X = np.concatenate((uni_test_X, bi_test_X), axis=1)

    uni_bi_bow_svm = build_svm_classifier(
        np.concatenate((uni_train_X, bi_train_X), axis=1),
        development_data['sentiment'].values,
        kernel='linear',
        probability=True)
    uni_bi_bow_acc = estimate_svm_accuracy(uni_bi_test_X, test_Y,
                                           uni_bi_bow_svm)
    print(
        'bow accuracy with presence and both uni and bigrams and a linear kernel',
        uni_bi_bow_acc)

    uni_bow_svm = build_svm_classifier(uni_train_X,
                                       development_data['sentiment'].values,
                                       kernel='linear')
    uni_bow_acc = estimate_svm_accuracy(uni_test_X, test_Y, uni_bow_svm)
    print('bow accuracy with presence, unigrams and a linear kernel',
          uni_bow_acc)
    print('-------------')

    print('Error analysis for Doc2Vec')
    model_error_analysis(doc2vec_test_X, blind_test_set, doc2vec_svm)
    evaluate_vector_qualities(doc2vec_test_X, test_Y)
    print('Error analysis for BOW with both unigrams and bigrams')
    model_error_analysis(uni_bi_test_X, blind_test_set, uni_bi_bow_svm)
    evaluate_vector_qualities(uni_bi_test_X, test_Y)
    print('----------------')

    print('Vector quality estimation')
    # NOTE(review): model_imdb is not defined in this function; presumably a
    # module-level name -- confirm, otherwise this raises NameError.
    evaluate_vector_qualities(uni_test_X, test_Y, model_imdb)
    do_test_visualisations(blind_test_set, uni_bi_test_X, test_Y,
                           doc2vec_test_X, doc2vec_model)
    print('----------------')

    print('Plotting emotion sentences for intensification analysis')
    run_intensification_analysis(doc2vec_model)

    print('Running a deployment test')
    deployment_test(imdb_data_folder, doc2vec_model, doc2vec_svm,
                    uni_vectorizer, bi_vectorizer, uni_bi_bow_svm)
def get_cross_validated_baseline_accuracies(development_data, doc2vec_Xs,
                                            doc2vec_Y, doc2vec_models):
    """
    Print baseline accuracies for the BOW and Doc2Vec svm variants.

    development_data: table with 'review' and 'sentiment' columns.
    doc2vec_Xs: training matrices for the doc2vec svms; judging by the
        printed labels index 0 is dbow, 1 is dm and 2 is dm concat.
    doc2vec_Y: training labels shared by all doc2vec svms.
    doc2vec_models: models in the same order as doc2vec_Xs, used to build
        vectors for the development reviews.

    BOW variants are scored with cross_validate_svm on the development data.
    Doc2Vec svms are trained on doc2vec_Xs / doc2vec_Y and scored on the
    development reviews. Returns None; results are only printed.
    """
    dev_reviews = development_data['review'].values
    test_Y = development_data['sentiment'].values
    Y = development_data['sentiment'].to_numpy()

    ############# BOW accuracies ##############
    X, _ = get_bow_vectors(dev_reviews, min_count=4, max_frac=0.5)
    X_pres, _ = get_bow_vectors(dev_reviews,
                                min_count=4,
                                max_frac=0.5,
                                frequency=False)
    # NOTE(review): these vectors are built with lowercase=False but are
    # reported below as 'lowercased input' -- confirm the label is intended.
    X_low, _ = get_bow_vectors(dev_reviews,
                               min_count=4,
                               max_frac=0.5,
                               lowercase=False,
                               frequency=False)
    X_bi, _ = get_bow_vectors(dev_reviews,
                              min_count=7,
                              max_frac=0.5,
                              frequency=False,
                              bigrams=True)

    acc6 = cross_validate_svm(X_bi, Y, kernel='linear', gamma='scale')
    print(
        'Cross validated bow accuracy when using a linear kernel, feature presence and bigrams',
        acc6)
    acc7 = cross_validate_svm(np.concatenate((X_bi, X_pres), axis=1),
                              Y,
                              kernel='linear',
                              gamma='scale')
    print(
        'Cross validated bow accuracy when using a linear kernel, feature presence and both unigrams and bigrams',
        acc7)
    acc5 = cross_validate_svm(X_low, Y, kernel='linear', gamma='scale')
    print(
        'Cross validated bow accuracy when using a linear kernel, feature presence and lowercased input',
        acc5)
    acc4 = cross_validate_svm(X_pres, Y, kernel='rbf', gamma='scale')
    print(
        'Cross validated bow accuracy when using a gaussian kernel and feature presence',
        acc4)
    acc3 = cross_validate_svm(X_pres, Y, kernel='linear', gamma='scale')
    print(
        'Cross validated bow accuracy when using a linear kernel and feature presence',
        acc3)
    acc1 = cross_validate_svm(X, Y, kernel='rbf', gamma='scale')
    print('Cross validated bow accuracy when using a gaussian kernel', acc1)
    acc2 = cross_validate_svm(X, Y, kernel='linear')
    print('Cross validated bow accuracy when using a linear kernel', acc2)

    ############# Doc2Vec accuracies #############
    # Build the development vectors once per model. The original rebuilt them
    # before every svm and, for the last experiment, accidentally paired
    # dm+dm test vectors with an svm trained on dbow+dm vectors.
    dbow_test_X = get_doc2vec_data(dev_reviews, doc2vec_models[0])
    dm_test_X = get_doc2vec_data(dev_reviews, doc2vec_models[1])
    dm_concat_test_X = get_doc2vec_data(dev_reviews, doc2vec_models[2])

    # train and test concatenations must use the same order: dbow, then dm
    joined_train_X = np.concatenate((doc2vec_Xs[0], doc2vec_Xs[1]), axis=1)
    joined_test_X = np.concatenate((dbow_test_X, dm_test_X), axis=1)

    # (label, training matrix, test matrix, kernel), in reporting order
    doc2vec_experiments = [
        ('Doc2Vec accuracy with a gaussian kernel and dm concat vectors',
         doc2vec_Xs[2], dm_concat_test_X, 'rbf'),
        ('Doc2Vec accuracy with a linear kernel and dm concat vectors',
         doc2vec_Xs[2], dm_concat_test_X, 'linear'),
        ('Doc2Vec accuracy with a gaussian kernel and dbow',
         doc2vec_Xs[0], dbow_test_X, 'rbf'),
        ('Doc2Vec accuracy with a linear kernel and dbow',
         doc2vec_Xs[0], dbow_test_X, 'linear'),
        ('Doc2Vec accuracy with concatenated vectors and gaussian kernel',
         joined_train_X, joined_test_X, 'rbf'),
        ('Doc2Vec accuracy with a linear kernel and dm',
         doc2vec_Xs[1], dm_test_X, 'linear'),
        ('Doc2Vec accuracy with a gaussian kernel and dm',
         doc2vec_Xs[1], dm_test_X, 'rbf'),
        ('Doc2Vec accuracy with concatenated vectors and linear kernel',
         joined_train_X, joined_test_X, 'linear'),
    ]
    for label, train_X, test_X, kernel in doc2vec_experiments:
        svm = build_svm_classifier(train_X,
                                   doc2vec_Y,
                                   kernel=kernel,
                                   gamma='scale')
        accuracy = estimate_svm_accuracy(test_X, test_Y, svm)
        print(label, accuracy)