コード例 #1
0
def run_permutation_test_bow_lowercase(train, test, kernel):
    """Permutation-test two BOW SVMs that differ only in the ``lowercase`` option.

    Both feature sets are built with ``min_count=4``, ``max_frac=0.5`` and
    ``frequency=False``. The first one leaves ``lowercase`` at whatever
    ``get_bow_vectors`` defaults to; the second one passes
    ``lowercase=False`` explicitly.

    Args:
        train: Data with ``'review'`` and ``'sentiment'`` columns, used to
            fit both vectorizers and both SVMs.
        test: Data with the same columns, used for the compared predictions.
        kernel: Kernel forwarded to ``build_svm_classifier`` for both SVMs.

    Returns:
        The result of ``permutation_test`` on the two sets of predictions.
    """
    labels_train = train['sentiment'].to_numpy()
    labels_test = test['sentiment'].to_numpy()
    reviews_train = train['review'].values
    reviews_test = test['review'].values

    # Options shared by both vectorizations.
    shared_options = dict(min_count=4, max_frac=0.5, frequency=False)

    # Variant 1: default ``lowercase`` behaviour.
    default_train_X, default_vectorizer = get_bow_vectors(
        reviews_train, **shared_options)
    default_test_X, _ = get_bow_vectors(reviews_test,
                                        vectorizer=default_vectorizer)

    # Variant 2: ``lowercase`` explicitly switched off.
    cased_train_X, cased_vectorizer = get_bow_vectors(reviews_train,
                                                      lowercase=False,
                                                      **shared_options)
    cased_test_X, _ = get_bow_vectors(reviews_test,
                                      vectorizer=cased_vectorizer)

    default_svm = build_svm_classifier(default_train_X,
                                       labels_train,
                                       kernel=kernel,
                                       gamma='scale')
    cased_svm = build_svm_classifier(cased_train_X,
                                     labels_train,
                                     kernel=kernel,
                                     gamma='scale')

    default_predictions = default_svm.predict(default_test_X)
    cased_predictions = cased_svm.predict(cased_test_X)
    return permutation_test(labels_test, default_predictions,
                            cased_predictions)
コード例 #2
0
def run_permutation_test_bow_vs_doc2vec(train, test, bow_kernel,
                                        doc2vec_kernel, doc2vec_train_X,
                                        doc2vec_train_Y, doc2vec_model):
    """Permutation-test a BOW SVM against a Doc2Vec SVM on the same test set.

    Args:
        train: Data with ``'review'`` and ``'sentiment'`` columns; used to
            fit the BOW vectorizer and the BOW SVM.
        test: Data with the same columns; both SVMs predict on its reviews.
        bow_kernel: Kernel for the BOW SVM.
        doc2vec_kernel: Kernel for the Doc2Vec SVM.
        doc2vec_train_X: Training vectors for the Doc2Vec SVM.
        doc2vec_train_Y: Training labels for the Doc2Vec SVM.
        doc2vec_model: Model handed to ``get_doc2vec_data`` to vectorize the
            test reviews.

    Returns:
        The result of ``permutation_test`` on the two sets of predictions.
    """
    labels_train = train['sentiment'].to_numpy()
    labels_test = test['sentiment'].to_numpy()

    # Feature preparation: BOW first, then Doc2Vec vectors for the test set.
    bow_features_train, bow_vectorizer = get_bow_vectors(
        train['review'].values, min_count=4, max_frac=0.5, frequency=False)
    reviews_test = test['review'].values
    bow_features_test, _ = get_bow_vectors(reviews_test,
                                           vectorizer=bow_vectorizer)
    doc2vec_features_test = get_doc2vec_data(reviews_test, doc2vec_model)

    # Fit one classifier per representation.
    bow_classifier = build_svm_classifier(bow_features_train,
                                          labels_train,
                                          kernel=bow_kernel,
                                          gamma='scale')
    doc2vec_classifier = build_svm_classifier(doc2vec_train_X,
                                              doc2vec_train_Y,
                                              kernel=doc2vec_kernel,
                                              gamma='scale')

    # The BOW training matrix is not needed past this point; drop the local
    # reference before predicting.
    del bow_features_train

    bow_predictions = bow_classifier.predict(bow_features_test)
    doc2vec_predictions = doc2vec_classifier.predict(doc2vec_features_test)
    return permutation_test(labels_test, bow_predictions,
                            doc2vec_predictions)
コード例 #3
0
def run_permutation_test_bow_with_uni_and_bigrams(train, test, kernel):
    """Permutation-test a ``bigrams=False`` BOW SVM against one that also gets
    the ``bigrams=True`` features.

    The first SVM is trained on the ``bigrams=False`` matrix
    (``min_count=4``); the second on that matrix concatenated column-wise
    with the ``bigrams=True`` matrix (``min_count=7``). Both vectorizations
    use ``max_frac=0.5`` and ``frequency=False``.

    Args:
        train: Data with ``'review'`` and ``'sentiment'`` columns, used to
            fit the vectorizers and both SVMs.
        test: Data with the same columns, used for the compared predictions.
        kernel: Kernel forwarded to ``build_svm_classifier`` for both SVMs.

    Returns:
        The result of ``permutation_test`` on the two sets of predictions.
    """
    labels_train = train['sentiment'].to_numpy()
    labels_test = test['sentiment'].to_numpy()
    reviews_train = train['review'].values
    reviews_test = test['review'].values

    # Vectorize with bigrams=True first, then with bigrams=False.
    bigram_train, bigram_vectorizer = get_bow_vectors(reviews_train,
                                                      min_count=7,
                                                      max_frac=0.5,
                                                      frequency=False,
                                                      bigrams=True)
    bigram_test, _ = get_bow_vectors(reviews_test,
                                     vectorizer=bigram_vectorizer)
    unigram_train, unigram_vectorizer = get_bow_vectors(reviews_train,
                                                        min_count=4,
                                                        max_frac=0.5,
                                                        frequency=False,
                                                        bigrams=False)
    unigram_test, _ = get_bow_vectors(reviews_test,
                                      vectorizer=unigram_vectorizer)

    unigram_svm = build_svm_classifier(unigram_train,
                                       labels_train,
                                       kernel=kernel,
                                       gamma='scale')
    # Column-wise concatenation: every row keeps its unigram features and
    # gains the bigram ones.
    combined_train = np.concatenate((unigram_train, bigram_train), axis=1)
    combined_svm = build_svm_classifier(combined_train,
                                        labels_train,
                                        kernel=kernel,
                                        gamma='scale')

    unigram_predictions = unigram_svm.predict(unigram_test)
    combined_test = np.concatenate((unigram_test, bigram_test), axis=1)
    combined_predictions = combined_svm.predict(combined_test)
    return permutation_test(labels_test, unigram_predictions,
                            combined_predictions)
コード例 #4
0
def run_permutation_test_bow_with_bigrams(train, test, kernel):
    """Run a permutation test of the difference between using only bigrams
    and only unigrams.

    The first SVM is trained on the ``bigrams=True`` matrix
    (``min_count=7``), the second on the matrix built without the
    ``bigrams`` argument (``min_count=4``). Both use ``max_frac=0.5`` and
    ``frequency=False``.

    Args:
        train: Data with ``'review'`` and ``'sentiment'`` columns, used to
            fit the vectorizers and both SVMs.
        test: Data with the same columns, used for the compared predictions.
        kernel: Kernel forwarded to ``build_svm_classifier`` for both SVMs.

    Returns:
        The result of ``permutation_test`` with the bigram predictions first
        and the unigram predictions second.
    """
    labels_train = train['sentiment'].to_numpy()
    labels_test = test['sentiment'].to_numpy()
    reviews_train = train['review'].values
    reviews_test = test['review'].values

    bigram_train, bigram_vectorizer = get_bow_vectors(reviews_train,
                                                      min_count=7,
                                                      max_frac=0.5,
                                                      frequency=False,
                                                      bigrams=True)
    bigram_test, _ = get_bow_vectors(reviews_test,
                                     vectorizer=bigram_vectorizer)
    unigram_train, unigram_vectorizer = get_bow_vectors(reviews_train,
                                                        min_count=4,
                                                        max_frac=0.5,
                                                        frequency=False)
    unigram_test, _ = get_bow_vectors(reviews_test,
                                      vectorizer=unigram_vectorizer)

    bigram_svm = build_svm_classifier(bigram_train,
                                      labels_train,
                                      kernel=kernel,
                                      gamma='scale')
    unigram_svm = build_svm_classifier(unigram_train,
                                       labels_train,
                                       kernel=kernel,
                                       gamma='scale')

    bigram_predictions = bigram_svm.predict(bigram_test)
    unigram_predictions = unigram_svm.predict(unigram_test)
    return permutation_test(labels_test, bigram_predictions,
                            unigram_predictions)
コード例 #5
0
def run_permutation_test_bow_feature_presence_vs_frequency(
        train, test, kernel):
    """Permutation-test a ``frequency=False`` BOW SVM against one built with
    the default ``frequency`` setting of ``get_bow_vectors``.

    Both vectorizations use ``min_count=4`` and ``max_frac=0.5``.

    Args:
        train: Data with ``'review'`` and ``'sentiment'`` columns, used to
            fit the vectorizers and both SVMs.
        test: Data with the same columns, used for the compared predictions.
        kernel: Kernel forwarded to ``build_svm_classifier`` for both SVMs.

    Returns:
        The result of ``permutation_test`` with the ``frequency=False``
        predictions first and the default-setting predictions second.
    """
    labels_train = train['sentiment'].to_numpy()
    labels_test = test['sentiment'].to_numpy()
    reviews_train = train['review'].values
    reviews_test = test['review'].values

    # Variant with frequency=False.
    presence_train, presence_vectorizer = get_bow_vectors(reviews_train,
                                                          min_count=4,
                                                          max_frac=0.5,
                                                          frequency=False)
    presence_test, _ = get_bow_vectors(reviews_test,
                                       vectorizer=presence_vectorizer)

    # Variant that leaves ``frequency`` at its default.
    default_train, default_vectorizer = get_bow_vectors(reviews_train,
                                                        min_count=4,
                                                        max_frac=0.5)
    default_test, _ = get_bow_vectors(reviews_test,
                                      vectorizer=default_vectorizer)

    presence_svm = build_svm_classifier(presence_train,
                                        labels_train,
                                        kernel=kernel,
                                        gamma='scale')
    default_svm = build_svm_classifier(default_train,
                                       labels_train,
                                       kernel=kernel,
                                       gamma='scale')

    presence_predictions = presence_svm.predict(presence_test)
    default_predictions = default_svm.predict(default_test)
    return permutation_test(labels_test, presence_predictions,
                            default_predictions)
コード例 #6
0
def get_bow_data(review_data,
                 train_frac=0.7,
                 min_count=10,
                 max_frac=0.5,
                 dim=100):
    """Get BOW vector training and test sets.

    Args:
        review_data: Data with ``'review'`` and ``'sentiment'`` columns.
        train_frac: Fraction handed to ``get_train_test_split``.
        min_count: Forwarded to ``get_bow_vectors``.
        max_frac: Forwarded to ``get_bow_vectors``.
        dim: Unused; kept so existing callers that pass it keep working.

    Returns:
        Whatever ``get_train_test_split`` returns for the BOW matrix and the
        sentiment labels.
    """
    # get_bow_vectors returns a (matrix, vectorizer) pair at every other call
    # site in this file, so unpack it; the vectorizer is not needed here.
    X, _ = get_bow_vectors(review_data['review'].values,
                           min_count=min_count,
                           max_frac=max_frac)
    Y = review_data['sentiment'].to_numpy()
    print('Created a BOW vector of shape', X.shape)
    # Honour the caller's train_frac instead of a hard-coded 0.7.
    return get_train_test_split(train_frac, X, Y)
コード例 #7
0
def run_permutation_test_two_bow_kernels(train, test, kernel1, kernel2):
    """Run a permutation test of the difference between two SVM kernels on
    the same BOW vectors.

    The BOW features are built once (``min_count=4``, ``max_frac=0.5``,
    ``frequency=False``) and shared by both classifiers.

    Args:
        train: Data with ``'review'`` and ``'sentiment'`` columns, used to
            fit the vectorizer and both SVMs.
        test: Data with the same columns, used for the compared predictions.
        kernel1: Kernel of the first SVM.
        kernel2: Kernel of the second SVM.

    Returns:
        The result of ``permutation_test`` on the two sets of predictions.
    """
    labels_train = train['sentiment'].to_numpy()
    labels_test = test['sentiment'].to_numpy()

    features_train, vectorizer = get_bow_vectors(train['review'].values,
                                                 min_count=4,
                                                 max_frac=0.5,
                                                 frequency=False)
    features_test, _ = get_bow_vectors(test['review'].values,
                                       vectorizer=vectorizer)

    first_svm = build_svm_classifier(features_train,
                                     labels_train,
                                     kernel=kernel1,
                                     gamma='scale')
    second_svm = build_svm_classifier(features_train,
                                      labels_train,
                                      kernel=kernel2,
                                      gamma='scale')

    first_predictions = first_svm.predict(features_test)
    second_predictions = second_svm.predict(features_test)
    return permutation_test(labels_test, first_predictions,
                            second_predictions)
コード例 #8
0
def deployment_test(imdb_data_folder, doc2vec_model, doc2vec_svm,
                    uni_vectorizer, bi_vectorizer, uni_bi_bow_svm):
    """Evaluate already-trained Doc2Vec and BOW SVMs on the 'new' reviews.

    Loads the reviews from the ``'new'`` subfolder of ``imdb_data_folder``,
    prints the accuracy of both classifiers, and then prints each review
    together with both predictions and its correct label.

    Args:
        imdb_data_folder: Root folder passed to ``get_reviews``.
        doc2vec_model: Model used by ``get_doc2vec_data`` to vectorize the
            new reviews.
        doc2vec_svm: Trained classifier for the Doc2Vec vectors.
        uni_vectorizer: Fitted vectorizer for the first half of the BOW
            feature matrix.
        bi_vectorizer: Fitted vectorizer for the second half of the BOW
            feature matrix.
        uni_bi_bow_svm: Trained classifier for the concatenated BOW matrix.

    Returns:
        None. All results are printed.
    """
    print('Fetching new reviews')
    # The folder comes from the caller; it used to be overwritten here with a
    # hard-coded 'aclImdb', which made the parameter meaningless.
    imdb_sentiments = ['pos', 'neg']
    # get new imdb reviews
    new_reviews = get_reviews(imdb_data_folder, imdb_sentiments, ['new'])
    new_test_Y = new_reviews['sentiment']

    # Doc2Vec accuracy on the new reviews.
    doc2vec_test_new = get_doc2vec_data(new_reviews['review'].values,
                                        doc2vec_model)
    doc2vec_acc_new = estimate_svm_accuracy(doc2vec_test_new, new_test_Y,
                                            doc2vec_svm)
    print('Doc2Vec acc with new data', doc2vec_acc_new)

    # BOW accuracy on the new reviews, using the already-fitted vectorizers.
    new_test_X, _ = get_bow_vectors(new_reviews['review'].values,
                                    min_count=4,
                                    max_frac=0.5,
                                    frequency=False,
                                    vectorizer=uni_vectorizer)
    new_bi_test_X, _ = get_bow_vectors(new_reviews['review'].values,
                                       min_count=7,
                                       max_frac=0.5,
                                       frequency=False,
                                       vectorizer=bi_vectorizer)
    # Column order (uni, bi) matches how main() builds the training matrix
    # for uni_bi_bow_svm.
    new_conc_test_X = np.concatenate((new_test_X, new_bi_test_X), axis=1)
    bow_acc_new = estimate_svm_accuracy(new_conc_test_X, new_test_Y,
                                        uni_bi_bow_svm)
    print('BOW acc with new data', bow_acc_new)

    # Per-review dump for manual inspection.
    for i in range(len(new_reviews)):
        print('i:', i)
        print('review')
        print(new_reviews['review'].iloc[i])
        # predict() is called on a single row reshaped to (1, n_features).
        print('bow prediction',
              uni_bi_bow_svm.predict(new_conc_test_X[i].reshape(1, -1)))
        print('doc2vec prediction',
              doc2vec_svm.predict(doc2vec_test_new[i].reshape(1, -1)))
        print('correct label', new_reviews['sentiment'].iloc[i])
        print('-------------')
コード例 #9
0
def main():
    """Run the full experiment pipeline.

    Steps, in order: seed NumPy, load the data, hold out a blind test set,
    train three Doc2Vec variants, print cross-validated baselines, run the
    cross-validated permutation tests, report blind-test accuracies for the
    chosen Doc2Vec and BOW models, run error analysis and visualisations,
    and finish with the deployment test.
    """
    # Fixed seed for NumPy's global random state.
    np.random.seed(123)
    imdb_data_folder = 'aclImdb'
    imdb_sentiments = ['pos', 'neg']
    subfolders = ['train', 'test']
    # get imdb reviews to train doc2vec with
    imdb_reviews = get_reviews(imdb_data_folder, imdb_sentiments, subfolders)
    # Only the first value returned by get_uni_and_bi_grams is used.
    reviews, _ = get_uni_and_bi_grams('data-tagged')
    review_data = build_data(reviews)
    # set a blind set aside for reporting results
    # NOTE(review): presumably 0.9 is the share kept as development data -
    # confirm against get_train_test_split.
    development_data, blind_test_set = get_train_test_split(0.9, review_data)
    test_Y = blind_test_set['sentiment'].values
    #find_optimal_doc2vec_hyperparams(imdb_reviews, development_data)
    ####### TRAINING MODELS AND RUNNING EXPERIMENTS #############
    # Variant 1: dm=0 with dbow_words=1. This is the model used for the final
    # Doc2Vec results below and is reported as "dbow" by
    # get_cross_validated_baseline_accuracies.
    doc2vec_train_X, doc2vec_train_Y, doc2vec_model = train_doc2vec_model(
        imdb_reviews,
        epochs=30,
        window_size=4,
        dm=0,
        dbow_words=1,
        pretrained=True,
        save=True)
    doc2vec_test_X = get_doc2vec_data(blind_test_set['review'].values,
                                      doc2vec_model)
    doc2vec_svm = build_svm_classifier(doc2vec_train_X,
                                       doc2vec_train_Y,
                                       kernel='rbf',
                                       gamma='scale')
    # Variant 2: dm=1 with dm_concat=1. Its returned labels are not used
    # further; only doc2vec_train_Y from variant 1 is passed on.
    concat_doc2vec_train_X, concat_doc2vec_train_Y, concat_doc2vec_model = train_doc2vec_model(
        imdb_reviews,
        epochs=30,
        window_size=4,
        dm=1,
        dm_concat=1,
        pretrained=True,
        save=True)
    # Variant 3: dm=1 without dm_concat. Its returned labels are likewise not
    # used further.
    dm_doc2vec_train_X, dm_doc2vec_train_Y, dm_doc2vec_model = train_doc2vec_model(
        imdb_reviews,
        epochs=30,
        window_size=4,
        dm=1,
        pretrained=True,
        save=True)
    print('-----------')
    # Both lists use the same order (dm=0, dm=1, dm=1 + dm_concat); the
    # consumers index into them positionally.
    doc2vec_train_Xs = [
        doc2vec_train_X, dm_doc2vec_train_X, concat_doc2vec_train_X
    ]
    doc2vec_models = [doc2vec_model, dm_doc2vec_model, concat_doc2vec_model]
    get_cross_validated_baseline_accuracies(development_data,
                                            doc2vec_Xs=doc2vec_train_Xs,
                                            doc2vec_Y=doc2vec_train_Y,
                                            doc2vec_models=doc2vec_models)
    print('-----------')
    cross_validate_permutation_tests(development_data, imdb_reviews,
                                     doc2vec_train_Xs, doc2vec_train_Y,
                                     doc2vec_models)
    ###################### USING THE BEST MODELS #################
    print('---------Accuracies using the best models---------')
    doc2vec_acc = estimate_svm_accuracy(doc2vec_test_X, test_Y, doc2vec_svm)
    print('doc2vec accuracy', doc2vec_acc)
    # Fit the two BOW vectorizers on the development data ...
    X_pres, uni_vectorizer = get_bow_vectors(development_data['review'].values,
                                             min_count=4,
                                             max_frac=0.5,
                                             frequency=False)
    X_bi, bi_vectorizer = get_bow_vectors(development_data['review'].values,
                                          min_count=7,
                                          max_frac=0.5,
                                          frequency=False,
                                          bigrams=True)
    # ... and reuse them to vectorize the blind test set.
    test_X, _ = get_bow_vectors(blind_test_set['review'].values,
                                min_count=4,
                                max_frac=0.5,
                                frequency=False,
                                vectorizer=uni_vectorizer)
    bi_test_X, _ = get_bow_vectors(blind_test_set['review'].values,
                                   min_count=7,
                                   max_frac=0.5,
                                   frequency=False,
                                   vectorizer=bi_vectorizer)
    # Column order (uni, bi) must match between the train and test matrices.
    conc_test_X = np.concatenate((test_X, bi_test_X), axis=1)
    uni_bi_bow_svm = build_svm_classifier(np.concatenate((X_pres, X_bi),
                                                         axis=1),
                                          development_data['sentiment'].values,
                                          kernel='linear',
                                          probability=True)
    bow_acc = estimate_svm_accuracy(conc_test_X, test_Y, uni_bi_bow_svm)
    print(
        'bow accuracy with presence and both uni and bigrams and a linear kernel',
        bow_acc)
    # Second BOW model for comparison: trained on X_pres alone.
    bow_svm = build_svm_classifier(X_pres,
                                   development_data['sentiment'].values,
                                   kernel='linear')
    bow_acc2 = estimate_svm_accuracy(test_X, test_Y, bow_svm)
    print('bow accuracy with presence, unigrams and a linear kernel', bow_acc2)
    print('-------------')
    print('Error analysis for Doc2Vec')
    model_error_analysis(doc2vec_test_X, blind_test_set, doc2vec_svm)
    evaluate_vector_qualities(doc2vec_test_X, test_Y)
    print('Error analysis for BOW with both unigrams and bigrams')
    model_error_analysis(conc_test_X, blind_test_set, uni_bi_bow_svm)
    evaluate_vector_qualities(conc_test_X, test_Y)
    print('----------------')
    print('Vector quality estimation')
    # NOTE(review): model_imdb is not defined anywhere in this function. Unless
    # it is a module-level name, this call raises NameError - confirm. The two
    # calls above pass only two arguments.
    evaluate_vector_qualities(test_X, test_Y, model_imdb)
    do_test_visualisations(blind_test_set, conc_test_X, test_Y, doc2vec_test_X,
                           doc2vec_model)
    print('----------------')
    print('Plotting emotion sentences for intensification analysis')
    run_intensification_analysis(doc2vec_model)
    print('Running a deployment test')
    deployment_test(imdb_data_folder, doc2vec_model, doc2vec_svm,
                    uni_vectorizer, bi_vectorizer, uni_bi_bow_svm)
コード例 #10
0
def get_cross_validated_baseline_accuracies(development_data, doc2vec_Xs,
                                            doc2vec_Y, doc2vec_models):
    """Print baseline accuracies for the BOW and Doc2Vec SVM variants.

    BOW variants are scored with ``cross_validate_svm`` on
    ``development_data``. Doc2Vec variants are trained on the supplied
    ``doc2vec_Xs`` / ``doc2vec_Y`` and scored with ``estimate_svm_accuracy``
    on vectors obtained for the ``development_data`` reviews.

    Args:
        development_data: Data with ``'review'`` and ``'sentiment'`` columns.
        doc2vec_Xs: Three Doc2Vec training matrices. The printed labels treat
            index 0 as "dbow", index 1 as "dm" and index 2 as "dm concat".
        doc2vec_Y: Training labels shared by all three matrices.
        doc2vec_models: The three models, in the same order as
            ``doc2vec_Xs``.

    Returns:
        None. All accuracies are printed.
    """
    reviews = development_data['review'].values
    Y = development_data['sentiment'].to_numpy()

    ############# BOW accuracies ##############
    X, _ = get_bow_vectors(reviews, min_count=4, max_frac=0.5)
    X_pres, _ = get_bow_vectors(reviews,
                                min_count=4,
                                max_frac=0.5,
                                frequency=False)
    X_low, _ = get_bow_vectors(reviews,
                               min_count=4,
                               max_frac=0.5,
                               lowercase=False,
                               frequency=False)
    X_bi, _ = get_bow_vectors(reviews,
                              min_count=7,
                              max_frac=0.5,
                              frequency=False,
                              bigrams=True)

    acc6 = cross_validate_svm(X_bi, Y, kernel='linear', gamma='scale')
    print(
        'Cross validated bow accuracy when using a linear kernel, feature presence and bigrams',
        acc6)
    acc7 = cross_validate_svm(np.concatenate((X_bi, X_pres), axis=1),
                              Y,
                              kernel='linear',
                              gamma='scale')
    print(
        'Cross validated bow accuracy when using a linear kernel, feature presence and both unigrams and bigrams',
        acc7)
    # NOTE(review): X_low is built with lowercase=False although the label
    # below says "lowercased input" - confirm which one is intended.
    acc5 = cross_validate_svm(X_low, Y, kernel='linear', gamma='scale')
    print(
        'Cross validated bow accuracy when using a linear kernel, feature presence and lowercased input',
        acc5)
    acc4 = cross_validate_svm(X_pres, Y, kernel='rbf', gamma='scale')
    print(
        'Cross validated bow accuracy when using a gaussian kernel and feature presence',
        acc4)
    acc3 = cross_validate_svm(X_pres, Y, kernel='linear', gamma='scale')
    print(
        'Cross validated bow accuracy when using a linear kernel and feature presence',
        acc3)
    acc1 = cross_validate_svm(X, Y, kernel='rbf', gamma='scale')
    print('Cross validated bow accuracy when using a gaussian kernel', acc1)
    acc2 = cross_validate_svm(X, Y, kernel='linear')
    print('Cross validated bow accuracy when using a linear kernel', acc2)

    ############# Doc2Vec accuracies #############
    # NOTE(review): test vectors are re-inferred for each evaluation, keeping
    # the sequence of training/inference calls as it was. If get_doc2vec_data
    # is deterministic these calls could be shared - confirm before hoisting.
    dbow_train_X, dm_train_X, dm_concat_train_X = (doc2vec_Xs[0],
                                                   doc2vec_Xs[1],
                                                   doc2vec_Xs[2])
    dbow_model, dm_model, dm_concat_model = (doc2vec_models[0],
                                             doc2vec_models[1],
                                             doc2vec_models[2])

    # --- dm concat vectors ---
    svm = build_svm_classifier(dm_concat_train_X,
                               doc2vec_Y,
                               kernel='rbf',
                               gamma='scale')
    dm_concat_test_X = get_doc2vec_data(reviews, dm_concat_model)
    accuracy = estimate_svm_accuracy(dm_concat_test_X, Y, svm)
    print('Doc2Vec accuracy with a gaussian kernel and dm concat vectors',
          accuracy)

    svm = build_svm_classifier(dm_concat_train_X,
                               doc2vec_Y,
                               kernel='linear',
                               gamma='scale')
    dm_concat_test_X = get_doc2vec_data(reviews, dm_concat_model)
    accuracy = estimate_svm_accuracy(dm_concat_test_X, Y, svm)
    print('Doc2Vec accuracy with a linear kernel and dm concat vectors',
          accuracy)

    # --- dbow vectors ---
    svm = build_svm_classifier(dbow_train_X,
                               doc2vec_Y,
                               kernel='rbf',
                               gamma='scale')
    dbow_test_X = get_doc2vec_data(reviews, dbow_model)
    accuracy = estimate_svm_accuracy(dbow_test_X, Y, svm)
    print('Doc2Vec accuracy with a gaussian kernel and dbow', accuracy)

    svm = build_svm_classifier(dbow_train_X,
                               doc2vec_Y,
                               kernel='linear',
                               gamma='scale')
    dbow_test_X = get_doc2vec_data(reviews, dbow_model)
    accuracy = estimate_svm_accuracy(dbow_test_X, Y, svm)
    print('Doc2Vec accuracy with a linear kernel and dbow', accuracy)

    # --- dbow + dm concatenated, gaussian kernel ---
    dm_test_X = get_doc2vec_data(reviews, dm_model)
    test_concat_X = np.concatenate((dbow_test_X, dm_test_X), axis=1)
    svm = build_svm_classifier(np.concatenate((dbow_train_X, dm_train_X),
                                              axis=1),
                               doc2vec_Y,
                               kernel='rbf',
                               gamma='scale')
    accuracy = estimate_svm_accuracy(test_concat_X, Y, svm)
    print('Doc2Vec accuracy with concatenated vectors and gaussian kernel',
          accuracy)

    # --- dm vectors ---
    svm = build_svm_classifier(dm_train_X,
                               doc2vec_Y,
                               kernel='linear',
                               gamma='scale')
    dm_test_X = get_doc2vec_data(reviews, dm_model)
    accuracy = estimate_svm_accuracy(dm_test_X, Y, svm)
    print('Doc2Vec accuracy with a linear kernel and dm', accuracy)

    svm = build_svm_classifier(dm_train_X,
                               doc2vec_Y,
                               kernel='rbf',
                               gamma='scale')
    dm_test_X = get_doc2vec_data(reviews, dm_model)
    accuracy = estimate_svm_accuracy(dm_test_X, Y, svm)
    print('Doc2Vec accuracy with a gaussian kernel and dm', accuracy)

    # --- dbow + dm concatenated, linear kernel ---
    # The test matrix must pair dbow with dm vectors, in the same column order
    # as the training matrix. It used to concatenate the dm vectors with
    # themselves, so the SVM was scored on features it was never trained on.
    dm_test_X = get_doc2vec_data(reviews, dm_model)
    test_concat_X = np.concatenate((dbow_test_X, dm_test_X), axis=1)
    svm = build_svm_classifier(np.concatenate((dbow_train_X, dm_train_X),
                                              axis=1),
                               doc2vec_Y,
                               kernel='linear',
                               gamma='scale')
    accuracy = estimate_svm_accuracy(test_concat_X, Y, svm)
    print('Doc2Vec accuracy with concatenated vectors and linear kernel',
          accuracy)