Esempio n. 1
0
def run_permutation_test_bow_vs_doc2vec(train, test, bow_kernel,
                                        doc2vec_kernel, doc2vec_train_X,
                                        doc2vec_train_Y, doc2vec_model):
    """Permutation-test a bag-of-words SVM against a doc2vec SVM.

    The BOW classifier is trained on vectors derived here from
    ``train['review']``; the doc2vec classifier is trained on the
    caller-supplied ``doc2vec_train_X`` / ``doc2vec_train_Y``. Both predict
    on the reviews in ``test`` and the two prediction sets are handed to
    ``permutation_test`` together with ``test['sentiment']``.

    Args:
        train: Data with ``'review'`` and ``'sentiment'`` columns
            (presumably a pandas DataFrame -- confirm with callers).
        test: Held-out data with the same two columns.
        bow_kernel: Kernel passed to ``build_svm_classifier`` for the BOW
            model.
        doc2vec_kernel: Kernel passed to ``build_svm_classifier`` for the
            doc2vec model.
        doc2vec_train_X: Training features for the doc2vec model.
        doc2vec_train_Y: Training labels for the doc2vec model.
        doc2vec_model: Model given to ``get_doc2vec_data`` to embed the
            test reviews.

    Returns:
        Whatever ``permutation_test`` returns for the true test labels and
        the two sets of predictions (BOW first, doc2vec second).
    """
    y_train = train['sentiment'].to_numpy()
    y_test = test['sentiment'].to_numpy()

    # The vectorizer returned for the training reviews is reused for the
    # test reviews so both splits go through the same vectorizer object.
    train_features, fitted_vectorizer = get_bow_vectors(
        train['review'].values, min_count=4, max_frac=0.5, frequency=False)
    test_features, _ = get_bow_vectors(test['review'].values,
                                       vectorizer=fitted_vectorizer)
    embedded_test = get_doc2vec_data(test['review'].values, doc2vec_model)

    svm_for_bow = build_svm_classifier(
        train_features, y_train, kernel=bow_kernel, gamma='scale')
    svm_for_doc2vec = build_svm_classifier(
        doc2vec_train_X, doc2vec_train_Y, kernel=doc2vec_kernel,
        gamma='scale')

    # Drop the local reference to the training matrix before predicting;
    # it is not needed any more.
    del train_features

    bow_predictions = svm_for_bow.predict(test_features)
    doc2vec_predictions = svm_for_doc2vec.predict(embedded_test)
    return permutation_test(y_test, bow_predictions, doc2vec_predictions)
Esempio n. 2
0
def run_permutation_test_bow_lowercase(train, test, kernel):
    """Permutation-test two BOW SVMs that differ only in ``lowercase``.

    The first classifier uses BOW vectors built with ``lowercase`` left at
    the default of ``get_bow_vectors``; the second uses vectors built with
    ``lowercase=False``. All other vectorizer settings (``min_count=4``,
    ``max_frac=0.5``, ``frequency=False``) and the SVM kernel are shared.

    NOTE(review): which variant actually lowercases the text depends on the
    default inside ``get_bow_vectors`` -- confirm there.

    Args:
        train: Data with ``'review'`` and ``'sentiment'`` columns
            (presumably a pandas DataFrame -- confirm with callers).
        test: Held-out data with the same two columns.
        kernel: Kernel passed to ``build_svm_classifier`` for both models.

    Returns:
        Whatever ``permutation_test`` returns for the true test labels and
        the two sets of predictions (default variant first).
    """
    y_train = train['sentiment'].to_numpy()
    y_test = test['sentiment'].to_numpy()
    shared_bow_args = {'min_count': 4, 'max_frac': 0.5}

    # Variant 1: ``lowercase`` left at the helper's default.
    default_train_X, default_vectorizer = get_bow_vectors(
        train['review'].values, **shared_bow_args, frequency=False)
    default_test_X, _ = get_bow_vectors(test['review'].values,
                                        vectorizer=default_vectorizer)

    # Variant 2: same settings, but with lowercase=False.
    nolower_train_X, nolower_vectorizer = get_bow_vectors(
        train['review'].values, **shared_bow_args, lowercase=False,
        frequency=False)
    nolower_test_X, _ = get_bow_vectors(test['review'].values,
                                        vectorizer=nolower_vectorizer)

    def fit_svm(features):
        # Both classifiers share labels, kernel and gamma.
        return build_svm_classifier(features,
                                    y_train,
                                    kernel=kernel,
                                    gamma='scale')

    default_svm = fit_svm(default_train_X)
    nolower_svm = fit_svm(nolower_train_X)

    default_predictions = default_svm.predict(default_test_X)
    nolower_predictions = nolower_svm.predict(nolower_test_X)
    return permutation_test(y_test, default_predictions, nolower_predictions)
Esempio n. 3
0
def run_permutation_test_bow_with_uni_and_bigrams(train, test, kernel):
    """Permutation-test a unigram BOW SVM against a unigram+bigram one.

    Two BOW representations are built from ``train['review']``: one with
    ``bigrams=True`` (``min_count=7``) and one with ``bigrams=False``
    (``min_count=4``). The first classifier sees only the ``bigrams=False``
    features; the second sees both feature sets concatenated column-wise.

    NOTE(review): ``np.concatenate`` is applied directly to the outputs of
    ``get_bow_vectors``; this assumes they are dense 2-D arrays -- confirm.

    Args:
        train: Data with ``'review'`` and ``'sentiment'`` columns
            (presumably a pandas DataFrame -- confirm with callers).
        test: Held-out data with the same two columns.
        kernel: Kernel passed to ``build_svm_classifier`` for both models.

    Returns:
        Whatever ``permutation_test`` returns for the true test labels and
        the two sets of predictions (unigram-only model first).
    """
    y_train = train['sentiment'].to_numpy()
    y_test = test['sentiment'].to_numpy()

    bigram_train, bigram_vectorizer = get_bow_vectors(
        train['review'].values, min_count=7, max_frac=0.5, frequency=False,
        bigrams=True)
    bigram_test, _ = get_bow_vectors(test['review'].values,
                                     vectorizer=bigram_vectorizer)

    unigram_train, unigram_vectorizer = get_bow_vectors(
        train['review'].values, min_count=4, max_frac=0.5, frequency=False,
        bigrams=False)
    unigram_test, _ = get_bow_vectors(test['review'].values,
                                      vectorizer=unigram_vectorizer)

    svm_args = {'kernel': kernel, 'gamma': 'scale'}
    unigram_svm = build_svm_classifier(unigram_train, y_train, **svm_args)
    # Second model: unigram and bigram columns side by side.
    combined_svm = build_svm_classifier(
        np.concatenate((unigram_train, bigram_train), axis=1),
        y_train,
        **svm_args)

    unigram_predictions = unigram_svm.predict(unigram_test)
    combined_test = np.concatenate((unigram_test, bigram_test), axis=1)
    combined_predictions = combined_svm.predict(combined_test)
    return permutation_test(y_test, unigram_predictions,
                            combined_predictions)
Esempio n. 4
0
def run_permutation_test_bow_with_bigrams(train, test, kernel):
    """Permutation-test a bigram BOW SVM against a unigram BOW SVM.

    The first classifier is trained on vectors built with ``bigrams=True``
    and ``min_count=7``; the second on vectors built with ``min_count=4``
    and ``bigrams`` left at the default of ``get_bow_vectors`` (presumably
    unigrams only -- confirm against that helper).

    Args:
        train: Data with ``'review'`` and ``'sentiment'`` columns
            (presumably a pandas DataFrame -- confirm with callers).
        test: Held-out data with the same two columns.
        kernel: Kernel passed to ``build_svm_classifier`` for both models.

    Returns:
        Whatever ``permutation_test`` returns for the true test labels and
        the two sets of predictions (bigram model first).
    """
    y_train = train['sentiment'].to_numpy()
    y_test = test['sentiment'].to_numpy()

    bigram_train, bigram_vectorizer = get_bow_vectors(
        train['review'].values, min_count=7, max_frac=0.5, frequency=False,
        bigrams=True)
    bigram_test, _ = get_bow_vectors(test['review'].values,
                                     vectorizer=bigram_vectorizer)

    unigram_train, unigram_vectorizer = get_bow_vectors(
        train['review'].values, min_count=4, max_frac=0.5, frequency=False)
    unigram_test, _ = get_bow_vectors(test['review'].values,
                                      vectorizer=unigram_vectorizer)

    def fit_svm(features):
        # Both classifiers share labels, kernel and gamma.
        return build_svm_classifier(features,
                                    y_train,
                                    kernel=kernel,
                                    gamma='scale')

    bigram_svm = fit_svm(bigram_train)
    unigram_svm = fit_svm(unigram_train)

    bigram_predictions = bigram_svm.predict(bigram_test)
    unigram_predictions = unigram_svm.predict(unigram_test)
    return permutation_test(y_test, bigram_predictions, unigram_predictions)
Esempio n. 5
0
def run_permutation_test_two_different_doc2vecs(train, test, kernel1, kernel2,
                                                model1, model2, d2v_X_1,
                                                d2v_X_2, d2v_Y1, d2v_Y2):
    """Permutation-test two SVMs built on two different doc2vec models.

    Each classifier is trained on its own pre-computed features and labels
    and evaluated on the test reviews embedded with its own doc2vec model.

    Args:
        train: Unused by this function; kept in the signature.
        test: Data with ``'review'`` and ``'sentiment'`` columns
            (presumably a pandas DataFrame -- confirm with callers).
        kernel1: Kernel passed to ``build_svm_classifier`` for the first
            model.
        kernel2: Kernel passed to ``build_svm_classifier`` for the second
            model.
        model1: Model given to ``get_doc2vec_data`` for the first
            classifier's test features.
        model2: Model given to ``get_doc2vec_data`` for the second
            classifier's test features.
        d2v_X_1: Training features for the first classifier.
        d2v_X_2: Training features for the second classifier.
        d2v_Y1: Training labels for the first classifier.
        d2v_Y2: Training labels for the second classifier.

    Returns:
        Whatever ``permutation_test`` returns for the true test labels and
        the two sets of predictions (first model first).
    """
    # Embed the test reviews once per doc2vec model.
    first_test_features = get_doc2vec_data(test['review'].values, model1)
    second_test_features = get_doc2vec_data(test['review'].values, model2)

    fixed_svm_args = {'gamma': 'scale'}
    first_svm = build_svm_classifier(
        d2v_X_1, d2v_Y1, kernel=kernel1, **fixed_svm_args)
    second_svm = build_svm_classifier(
        d2v_X_2, d2v_Y2, kernel=kernel2, **fixed_svm_args)

    y_test = test['sentiment'].to_numpy()
    first_predictions = first_svm.predict(first_test_features)
    second_predictions = second_svm.predict(second_test_features)
    return permutation_test(y_test, first_predictions, second_predictions)
Esempio n. 6
0
def run_permutation_test_two_doc2vec_kernels(train, test, kernel1, kernel2,
                                             model, d2v_X, d2v_Y, **kwargs):
    """Permutation-test two SVM kernels on the same doc2vec features.

    Both classifiers are trained on ``d2v_X`` / ``d2v_Y`` and evaluated on
    the test reviews embedded with ``model``; they differ in the kernel and
    in that only the first one receives ``**kwargs``.

    Args:
        train: Unused by this function; kept in the signature.
        test: Data with ``'review'`` and ``'sentiment'`` columns
            (presumably a pandas DataFrame -- confirm with callers).
        kernel1: Kernel passed to ``build_svm_classifier`` for the first
            model.
        kernel2: Kernel passed to ``build_svm_classifier`` for the second
            model.
        model: Model given to ``get_doc2vec_data`` to embed the test
            reviews.
        d2v_X: Training features shared by both classifiers.
        d2v_Y: Training labels shared by both classifiers.
        **kwargs: Extra keyword arguments forwarded to
            ``build_svm_classifier`` for the FIRST classifier only.

    Returns:
        Whatever ``permutation_test`` returns for the true test labels and
        the two sets of predictions (``kernel1`` model first).
    """
    embedded_test = get_doc2vec_data(test['review'].values, model)

    # NOTE(review): kwargs are deliberately not passed to the second
    # classifier here, matching the existing behaviour -- confirm intent.
    first_svm = build_svm_classifier(
        d2v_X, d2v_Y, kernel=kernel1, gamma='scale', **kwargs)
    second_svm = build_svm_classifier(
        d2v_X, d2v_Y, kernel=kernel2, gamma='scale')

    y_test = test['sentiment'].to_numpy()
    first_predictions = first_svm.predict(embedded_test)
    second_predictions = second_svm.predict(embedded_test)
    return permutation_test(y_test, first_predictions, second_predictions)
Esempio n. 7
0
def run_permutation_concatenated_vs_simple_doc2vec(train, test, kernel1,
                                                   kernel2, model1, model2,
                                                   d2v_X_1, d2v_X_2, d2v_Y):
    """Permutation-test a single-doc2vec SVM against a concatenated one.

    The first classifier is trained on ``d2v_X_1`` alone; the second on
    ``d2v_X_1`` and ``d2v_X_2`` concatenated along axis 1. Test features are
    produced the same way from ``model1`` and ``model2``.

    Args:
        train: Unused by this function; kept in the signature.
        test: Data with ``'review'`` and ``'sentiment'`` columns
            (presumably a pandas DataFrame -- confirm with callers).
        kernel1: Kernel passed to ``build_svm_classifier`` for the
            single-model classifier.
        kernel2: Kernel passed to ``build_svm_classifier`` for the
            concatenated classifier.
        model1: Model given to ``get_doc2vec_data`` for the first block of
            test features.
        model2: Model given to ``get_doc2vec_data`` for the second block of
            test features.
        d2v_X_1: Training features matching ``model1``.
        d2v_X_2: Training features matching ``model2``.
        d2v_Y: Training labels shared by both classifiers.

    Returns:
        Whatever ``permutation_test`` returns for the true test labels and
        the two sets of predictions (single-model classifier first).
    """
    simple_test = get_doc2vec_data(test['review'].values, model1)
    extra_test = get_doc2vec_data(test['review'].values, model2)
    combined_test = np.concatenate((simple_test, extra_test), axis=1)

    svm_defaults = {'gamma': 'scale'}
    simple_svm = build_svm_classifier(
        d2v_X_1, d2v_Y, kernel=kernel1, **svm_defaults)
    # Second model: both training feature blocks side by side.
    combined_svm = build_svm_classifier(
        np.concatenate((d2v_X_1, d2v_X_2), axis=1),
        d2v_Y,
        kernel=kernel2,
        **svm_defaults)

    y_test = test['sentiment'].to_numpy()
    simple_predictions = simple_svm.predict(simple_test)
    combined_predictions = combined_svm.predict(combined_test)
    return permutation_test(y_test, simple_predictions,
                            combined_predictions)
Esempio n. 8
0
def run_permutation_test_bow_feature_presence_vs_frequency(
        train, test, kernel):
    """Permutation-test BOW SVMs built with and without ``frequency=False``.

    The first classifier uses vectors built with ``frequency=False``; the
    second uses vectors built with ``frequency`` left at the default of
    ``get_bow_vectors``. Going by the function name these are presumably
    presence features versus frequency features -- confirm against that
    helper. ``min_count=4`` and ``max_frac=0.5`` are shared.

    Args:
        train: Data with ``'review'`` and ``'sentiment'`` columns
            (presumably a pandas DataFrame -- confirm with callers).
        test: Held-out data with the same two columns.
        kernel: Kernel passed to ``build_svm_classifier`` for both models.

    Returns:
        Whatever ``permutation_test`` returns for the true test labels and
        the two sets of predictions (``frequency=False`` model first).
    """
    y_train = train['sentiment'].to_numpy()
    y_test = test['sentiment'].to_numpy()
    shared_bow_args = {'min_count': 4, 'max_frac': 0.5}

    presence_train, presence_vectorizer = get_bow_vectors(
        train['review'].values, **shared_bow_args, frequency=False)
    presence_test, _ = get_bow_vectors(test['review'].values,
                                       vectorizer=presence_vectorizer)

    default_train, default_vectorizer = get_bow_vectors(
        train['review'].values, **shared_bow_args)
    default_test, _ = get_bow_vectors(test['review'].values,
                                      vectorizer=default_vectorizer)

    svm_args = {'kernel': kernel, 'gamma': 'scale'}
    presence_svm = build_svm_classifier(presence_train, y_train, **svm_args)
    default_svm = build_svm_classifier(default_train, y_train, **svm_args)

    presence_predictions = presence_svm.predict(presence_test)
    default_predictions = default_svm.predict(default_test)
    return permutation_test(y_test, presence_predictions,
                            default_predictions)
Esempio n. 9
0
def run_permutation_test_two_bow_kernels(train, test, kernel1, kernel2):
    """Permutation-test two SVM kernels on the same BOW vectors.

    A single BOW representation (``min_count=4``, ``max_frac=0.5``,
    ``frequency=False``) is built from ``train['review']``; two classifiers
    that differ only in their kernel are trained on it and evaluated on the
    same vectorized test reviews.

    Args:
        train: Data with ``'review'`` and ``'sentiment'`` columns
            (presumably a pandas DataFrame -- confirm with callers).
        test: Held-out data with the same two columns.
        kernel1: Kernel passed to ``build_svm_classifier`` for the first
            model.
        kernel2: Kernel passed to ``build_svm_classifier`` for the second
            model.

    Returns:
        Whatever ``permutation_test`` returns for the true test labels and
        the two sets of predictions (``kernel1`` model first).
    """
    y_train = train['sentiment'].to_numpy()
    y_test = test['sentiment'].to_numpy()

    # The vectorizer returned for the training reviews is reused for the
    # test reviews.
    train_features, fitted_vectorizer = get_bow_vectors(
        train['review'].values, min_count=4, max_frac=0.5, frequency=False)
    test_features, _ = get_bow_vectors(test['review'].values,
                                       vectorizer=fitted_vectorizer)

    def fit_with(kernel):
        # Same data and gamma for both models; only the kernel varies.
        return build_svm_classifier(train_features,
                                    y_train,
                                    kernel=kernel,
                                    gamma='scale')

    first_svm = fit_with(kernel1)
    second_svm = fit_with(kernel2)

    first_predictions = first_svm.predict(test_features)
    second_predictions = second_svm.predict(test_features)
    return permutation_test(y_test, first_predictions, second_predictions)