def run_permutation_test_bow_vs_doc2vec(train, test, bow_kernel, doc2vec_kernel,
                                        doc2vec_train_X, doc2vec_train_Y,
                                        doc2vec_model):
    """
    Compare a bag-of-words SVM with a doc2vec SVM via a permutation test.

    The BOW classifier is built here from ``train``; the doc2vec classifier
    is built from the supplied ``doc2vec_train_X`` / ``doc2vec_train_Y``.
    Both classifiers then predict on the reviews in ``test``.

    Args:
        train: Table with 'review' and 'sentiment' columns (presumably a
            pandas DataFrame - confirm against callers).
        test: Table with the same two columns, used for evaluation.
        bow_kernel: Kernel passed to the BOW classifier.
        doc2vec_kernel: Kernel passed to the doc2vec classifier.
        doc2vec_train_X: Training features for the doc2vec classifier.
        doc2vec_train_Y: Training labels for the doc2vec classifier.
        doc2vec_model: Model handed to ``get_doc2vec_data`` to build the
            doc2vec test features.

    Returns:
        Whatever ``permutation_test`` returns for the test labels and the
        two prediction vectors (BOW first, doc2vec second).
    """
    train_labels = train['sentiment'].to_numpy()
    test_labels = test['sentiment'].to_numpy()
    train_reviews = train['review'].values
    test_reviews = test['review'].values

    # The vectorizer returned for the training reviews is reused for the
    # test reviews.
    bow_features_train, bow_vectorizer = get_bow_vectors(
        train_reviews, min_count=4, max_frac=0.5, frequency=False)
    bow_features_test, _ = get_bow_vectors(
        test_reviews, vectorizer=bow_vectorizer)
    d2v_features_test = get_doc2vec_data(test_reviews, doc2vec_model)

    bow_classifier = build_svm_classifier(
        bow_features_train, train_labels, kernel=bow_kernel, gamma='scale')
    d2v_classifier = build_svm_classifier(
        doc2vec_train_X, doc2vec_train_Y, kernel=doc2vec_kernel,
        gamma='scale')

    # Drop the local reference to the BOW training matrix before predicting.
    del bow_features_train

    bow_predictions = bow_classifier.predict(bow_features_test)
    d2v_predictions = d2v_classifier.predict(d2v_features_test)
    return permutation_test(test_labels, bow_predictions, d2v_predictions)
def run_permutation_test_bow_lowercase(train, test, kernel):
    """
    Permutation test on the effect of the ``lowercase`` BOW option.

    Two SVMs with the same kernel are built: one on BOW vectors produced
    with the default ``lowercase`` setting of ``get_bow_vectors``, and one
    on vectors produced with ``lowercase=False``.
    NOTE(review): the default is presumably ``lowercase=True`` - confirm in
    ``get_bow_vectors``.

    Args:
        train: Table with 'review' and 'sentiment' columns (presumably a
            pandas DataFrame - confirm against callers).
        test: Table with the same two columns, used for evaluation.
        kernel: Kernel passed to both classifiers.

    Returns:
        Whatever ``permutation_test`` returns for the test labels and the
        two prediction vectors (default setting first, ``lowercase=False``
        second).
    """
    train_labels = train['sentiment'].to_numpy()
    test_labels = test['sentiment'].to_numpy()
    train_reviews = train['review'].values
    test_reviews = test['review'].values

    # Variant 1: default lowercase handling.
    default_train, default_vectorizer = get_bow_vectors(
        train_reviews, min_count=4, max_frac=0.5, frequency=False)
    default_test, _ = get_bow_vectors(
        test_reviews, vectorizer=default_vectorizer)

    # Variant 2: lowercasing explicitly disabled.
    cased_train, cased_vectorizer = get_bow_vectors(
        train_reviews, min_count=4, max_frac=0.5, lowercase=False,
        frequency=False)
    cased_test, _ = get_bow_vectors(test_reviews, vectorizer=cased_vectorizer)

    default_classifier = build_svm_classifier(
        default_train, train_labels, kernel=kernel, gamma='scale')
    cased_classifier = build_svm_classifier(
        cased_train, train_labels, kernel=kernel, gamma='scale')

    default_predictions = default_classifier.predict(default_test)
    cased_predictions = cased_classifier.predict(cased_test)
    return permutation_test(
        test_labels, default_predictions, cased_predictions)
def run_permutation_test_bow_with_uni_and_bigrams(train, test, kernel):
    """
    Permutation test of unigram features against unigram+bigram features.

    The first SVM is built on the ``bigrams=False`` BOW vectors only. The
    second is built on those vectors concatenated column-wise with the
    ``bigrams=True`` vectors.

    Args:
        train: Table with 'review' and 'sentiment' columns (presumably a
            pandas DataFrame - confirm against callers).
        test: Table with the same two columns, used for evaluation.
        kernel: Kernel passed to both classifiers.

    Returns:
        Whatever ``permutation_test`` returns for the test labels and the
        two prediction vectors (unigram-only first, combined second).
    """
    train_labels = train['sentiment'].to_numpy()
    test_labels = test['sentiment'].to_numpy()
    train_reviews = train['review'].values
    test_reviews = test['review'].values

    # Bigram features use a higher min_count (7) than unigram features (4).
    bigram_train, bigram_vectorizer = get_bow_vectors(
        train_reviews, min_count=7, max_frac=0.5, frequency=False,
        bigrams=True)
    bigram_test, _ = get_bow_vectors(
        test_reviews, vectorizer=bigram_vectorizer)

    unigram_train, unigram_vectorizer = get_bow_vectors(
        train_reviews, min_count=4, max_frac=0.5, frequency=False,
        bigrams=False)
    unigram_test, _ = get_bow_vectors(
        test_reviews, vectorizer=unigram_vectorizer)

    unigram_classifier = build_svm_classifier(
        unigram_train, train_labels, kernel=kernel, gamma='scale')

    # Features are stacked side by side (axis=1).
    # NOTE(review): np.concatenate assumes get_bow_vectors returns dense
    # arrays - confirm.
    combined_train = np.concatenate((unigram_train, bigram_train), axis=1)
    combined_classifier = build_svm_classifier(
        combined_train, train_labels, kernel=kernel, gamma='scale')

    unigram_predictions = unigram_classifier.predict(unigram_test)
    combined_test = np.concatenate((unigram_test, bigram_test), axis=1)
    combined_predictions = combined_classifier.predict(combined_test)
    return permutation_test(
        test_labels, unigram_predictions, combined_predictions)
def run_permutation_test_bow_with_bigrams(train, test, kernel):
    """
    Permutation test of bigram BOW features against unigram BOW features.

    One SVM is built on vectors produced with ``bigrams=True``, the other
    on vectors produced without the ``bigrams`` argument; both use the
    same kernel.

    Args:
        train: Table with 'review' and 'sentiment' columns (presumably a
            pandas DataFrame - confirm against callers).
        test: Table with the same two columns, used for evaluation.
        kernel: Kernel passed to both classifiers.

    Returns:
        Whatever ``permutation_test`` returns for the test labels and the
        two prediction vectors (bigram first, unigram second).
    """
    train_labels = train['sentiment'].to_numpy()
    test_labels = test['sentiment'].to_numpy()
    train_reviews = train['review'].values
    test_reviews = test['review'].values

    # Bigram features use a higher min_count (7) than unigram features (4).
    bigram_train, bigram_vectorizer = get_bow_vectors(
        train_reviews, min_count=7, max_frac=0.5, frequency=False,
        bigrams=True)
    bigram_test, _ = get_bow_vectors(
        test_reviews, vectorizer=bigram_vectorizer)

    unigram_train, unigram_vectorizer = get_bow_vectors(
        train_reviews, min_count=4, max_frac=0.5, frequency=False)
    unigram_test, _ = get_bow_vectors(
        test_reviews, vectorizer=unigram_vectorizer)

    bigram_classifier = build_svm_classifier(
        bigram_train, train_labels, kernel=kernel, gamma='scale')
    unigram_classifier = build_svm_classifier(
        unigram_train, train_labels, kernel=kernel, gamma='scale')

    bigram_predictions = bigram_classifier.predict(bigram_test)
    unigram_predictions = unigram_classifier.predict(unigram_test)
    return permutation_test(
        test_labels, bigram_predictions, unigram_predictions)
def run_permutation_test_two_different_doc2vecs(train, test, kernel1, kernel2,
                                                model1, model2, d2v_X_1,
                                                d2v_X_2, d2v_Y1, d2v_Y2):
    """
    Run a permutation test comparing SVMs built on two doc2vec models.

    Each classifier is built from its own training features and labels and
    evaluated on test features produced by its own doc2vec model.

    Args:
        train: Not used by this function.
        test: Table with 'review' and 'sentiment' columns (presumably a
            pandas DataFrame - confirm against callers).
        kernel1: Kernel for the first classifier.
        kernel2: Kernel for the second classifier.
        model1: Model passed to ``get_doc2vec_data`` for the first
            classifier's test features.
        model2: Model passed to ``get_doc2vec_data`` for the second
            classifier's test features.
        d2v_X_1: Training features for the first classifier.
        d2v_X_2: Training features for the second classifier.
        d2v_Y1: Training labels for the first classifier.
        d2v_Y2: Training labels for the second classifier.

    Returns:
        Whatever ``permutation_test`` returns for the test labels and the
        two prediction vectors (first classifier first).
    """
    test_reviews = test['review'].values

    # Test features are built once per doc2vec model.
    first_test_features = get_doc2vec_data(test_reviews, model1)
    second_test_features = get_doc2vec_data(test_reviews, model2)

    first_classifier = build_svm_classifier(
        d2v_X_1, d2v_Y1, kernel=kernel1, gamma='scale')
    second_classifier = build_svm_classifier(
        d2v_X_2, d2v_Y2, kernel=kernel2, gamma='scale')

    test_labels = test['sentiment'].to_numpy()
    first_predictions = first_classifier.predict(first_test_features)
    second_predictions = second_classifier.predict(second_test_features)
    return permutation_test(
        test_labels, first_predictions, second_predictions)
def run_permutation_test_two_doc2vec_kernels(train, test, kernel1, kernel2,
                                             model, d2v_X, d2v_Y, **kwargs):
    """
    Run a permutation test comparing two SVM kernels on doc2vec features.

    Both classifiers are built from the same training features and labels
    and predict on the same test features; only the kernel differs, plus
    any extra options given for the first classifier.

    Args:
        train: Not used by this function.
        test: Table with 'review' and 'sentiment' columns (presumably a
            pandas DataFrame - confirm against callers).
        kernel1: Kernel for the first classifier.
        kernel2: Kernel for the second classifier.
        model: Model passed to ``get_doc2vec_data`` to build test features.
        d2v_X: Training features shared by both classifiers.
        d2v_Y: Training labels shared by both classifiers.
        **kwargs: Extra keyword arguments forwarded to
            ``build_svm_classifier`` for the FIRST classifier only.

    Returns:
        Whatever ``permutation_test`` returns for the test labels and the
        two prediction vectors (``kernel1`` first, ``kernel2`` second).
    """
    test_features = get_doc2vec_data(test['review'].values, model)

    # NOTE(review): kwargs reach only the kernel1 classifier; presumably
    # they carry kernel-specific settings - confirm this is intended.
    first_classifier = build_svm_classifier(
        d2v_X, d2v_Y, kernel=kernel1, gamma='scale', **kwargs)
    second_classifier = build_svm_classifier(
        d2v_X, d2v_Y, kernel=kernel2, gamma='scale')

    test_labels = test['sentiment'].to_numpy()
    first_predictions = first_classifier.predict(test_features)
    second_predictions = second_classifier.predict(test_features)
    return permutation_test(
        test_labels, first_predictions, second_predictions)
def run_permutation_concatenated_vs_simple_doc2vec(train, test, kernel1,
                                                   kernel2, model1, model2,
                                                   d2v_X_1, d2v_X_2, d2v_Y):
    """
    Permutation test of a single doc2vec SVM against a concatenated one.

    The first classifier uses only the features tied to ``model1``. The
    second uses the features of ``model1`` and ``model2`` joined
    column-wise, for both training and test data.

    Args:
        train: Not used by this function.
        test: Table with 'review' and 'sentiment' columns (presumably a
            pandas DataFrame - confirm against callers).
        kernel1: Kernel for the single-model classifier.
        kernel2: Kernel for the concatenated-feature classifier.
        model1: Model passed to ``get_doc2vec_data`` for the first set of
            test features.
        model2: Model passed to ``get_doc2vec_data`` for the second set of
            test features.
        d2v_X_1: Training features paired with ``model1``.
        d2v_X_2: Training features paired with ``model2``.
        d2v_Y: Training labels shared by both classifiers.

    Returns:
        Whatever ``permutation_test`` returns for the test labels and the
        two prediction vectors (single model first, concatenated second).
    """
    test_reviews = test['review'].values

    single_test_features = get_doc2vec_data(test_reviews, model1)
    extra_test_features = get_doc2vec_data(test_reviews, model2)
    # Features are stacked side by side (axis=1).
    joined_test_features = np.concatenate(
        (single_test_features, extra_test_features), axis=1)

    single_classifier = build_svm_classifier(
        d2v_X_1, d2v_Y, kernel=kernel1, gamma='scale')
    joined_train_features = np.concatenate((d2v_X_1, d2v_X_2), axis=1)
    joined_classifier = build_svm_classifier(
        joined_train_features, d2v_Y, kernel=kernel2, gamma='scale')

    test_labels = test['sentiment'].to_numpy()
    single_predictions = single_classifier.predict(single_test_features)
    joined_predictions = joined_classifier.predict(joined_test_features)
    return permutation_test(
        test_labels, single_predictions, joined_predictions)
def run_permutation_test_bow_feature_presence_vs_frequency(train, test,
                                                           kernel):
    """
    Permutation test of the ``frequency`` option of the BOW vectors.

    One SVM is built on vectors produced with ``frequency=False``, the
    other on vectors produced with the default ``frequency`` setting of
    ``get_bow_vectors``.
    NOTE(review): going by the function name, ``frequency=False`` presumably
    encodes feature presence and the default encodes counts - confirm in
    ``get_bow_vectors``.

    Args:
        train: Table with 'review' and 'sentiment' columns (presumably a
            pandas DataFrame - confirm against callers).
        test: Table with the same two columns, used for evaluation.
        kernel: Kernel passed to both classifiers.

    Returns:
        Whatever ``permutation_test`` returns for the test labels and the
        two prediction vectors (``frequency=False`` first, default second).
    """
    train_labels = train['sentiment'].to_numpy()
    test_labels = test['sentiment'].to_numpy()
    train_reviews = train['review'].values
    test_reviews = test['review'].values

    # Variant 1: frequency explicitly disabled.
    presence_train, presence_vectorizer = get_bow_vectors(
        train_reviews, min_count=4, max_frac=0.5, frequency=False)
    presence_test, _ = get_bow_vectors(
        test_reviews, vectorizer=presence_vectorizer)

    # Variant 2: default frequency setting.
    default_train, default_vectorizer = get_bow_vectors(
        train_reviews, min_count=4, max_frac=0.5)
    default_test, _ = get_bow_vectors(
        test_reviews, vectorizer=default_vectorizer)

    presence_classifier = build_svm_classifier(
        presence_train, train_labels, kernel=kernel, gamma='scale')
    default_classifier = build_svm_classifier(
        default_train, train_labels, kernel=kernel, gamma='scale')

    presence_predictions = presence_classifier.predict(presence_test)
    default_predictions = default_classifier.predict(default_test)
    return permutation_test(
        test_labels, presence_predictions, default_predictions)
def run_permutation_test_two_bow_kernels(train, test, kernel1, kernel2):
    """
    Run a permutation test comparing two SVM kernels on the same BOW vectors.

    Both classifiers are built from identical training vectors and labels
    and predict on identical test vectors; only the kernel differs.

    Args:
        train: Table with 'review' and 'sentiment' columns (presumably a
            pandas DataFrame - confirm against callers).
        test: Table with the same two columns, used for evaluation.
        kernel1: Kernel for the first classifier.
        kernel2: Kernel for the second classifier.

    Returns:
        Whatever ``permutation_test`` returns for the test labels and the
        two prediction vectors (``kernel1`` first, ``kernel2`` second).
    """
    train_labels = train['sentiment'].to_numpy()
    test_labels = test['sentiment'].to_numpy()

    # The vectorizer returned for the training reviews is reused for the
    # test reviews.
    features_train, shared_vectorizer = get_bow_vectors(
        train['review'].values, min_count=4, max_frac=0.5, frequency=False)
    features_test, _ = get_bow_vectors(
        test['review'].values, vectorizer=shared_vectorizer)

    first_classifier = build_svm_classifier(
        features_train, train_labels, kernel=kernel1, gamma='scale')
    second_classifier = build_svm_classifier(
        features_train, train_labels, kernel=kernel2, gamma='scale')

    first_predictions = first_classifier.predict(features_test)
    second_predictions = second_classifier.predict(features_test)
    return permutation_test(
        test_labels, first_predictions, second_predictions)