Ejemplo n.º 1
0
def run_permutation_test_two_different_doc2vecs(train, test, kernel1, kernel2,
                                                model1, model2, d2v_X_1,
                                                d2v_X_2, d2v_Y1, d2v_Y2):
    """Permutation-test two SVMs, each backed by its own doc2vec model.

    ``model1`` / ``model2`` are passed to ``get_doc2vec_data`` to obtain the
    test features; ``d2v_X_1`` / ``d2v_Y1`` and ``d2v_X_2`` / ``d2v_Y2`` are
    the training data given to ``build_svm_classifier`` together with
    ``kernel1`` and ``kernel2`` respectively. ``train`` is accepted but not
    used by this function.

    Returns the result of ``permutation_test`` on the test sentiments and
    the two sets of predictions.
    """
    reviews = test['review'].values
    # One set of test features per doc2vec model.
    features_a = get_doc2vec_data(reviews, model1)
    features_b = get_doc2vec_data(reviews, model2)
    # One classifier per (training data, kernel) pair.
    classifier_a = build_svm_classifier(d2v_X_1,
                                        d2v_Y1,
                                        kernel=kernel1,
                                        gamma='scale')
    classifier_b = build_svm_classifier(d2v_X_2,
                                        d2v_Y2,
                                        kernel=kernel2,
                                        gamma='scale')
    gold = test['sentiment'].to_numpy()
    predictions_a = classifier_a.predict(features_a)
    predictions_b = classifier_b.predict(features_b)
    return permutation_test(gold, predictions_a, predictions_b)
Ejemplo n.º 2
0
def run_permutation_test_bow_vs_doc2vec(train, test, bow_kernel,
                                        doc2vec_kernel, doc2vec_train_X,
                                        doc2vec_train_Y, doc2vec_model):
    """Permutation-test a bag-of-words SVM against a doc2vec SVM.

    The bag-of-words features are built here from ``train`` / ``test``
    (presence features, ``min_count=4``, ``max_frac=0.5``); the test split
    reuses the vectorizer returned for the training split. The doc2vec
    classifier is fitted on the supplied ``doc2vec_train_X`` /
    ``doc2vec_train_Y`` and evaluated on vectors obtained from
    ``doc2vec_model`` for the test reviews.

    Returns the result of ``permutation_test`` on the test sentiments and
    the two sets of predictions.
    """
    gold_train = train['sentiment'].to_numpy()
    gold_test = test['sentiment'].to_numpy()
    test_reviews = test['review'].values

    # Features for both classifiers.
    bow_fit_X, fitted_vectorizer = get_bow_vectors(train['review'].values,
                                                   min_count=4,
                                                   max_frac=0.5,
                                                   frequency=False)
    bow_eval_X, _ = get_bow_vectors(test_reviews,
                                    vectorizer=fitted_vectorizer)
    d2v_eval_X = get_doc2vec_data(test_reviews, doc2vec_model)

    # Fit the two classifiers.
    bow_classifier = build_svm_classifier(bow_fit_X,
                                          gold_train,
                                          kernel=bow_kernel,
                                          gamma='scale')
    d2v_classifier = build_svm_classifier(doc2vec_train_X,
                                          doc2vec_train_Y,
                                          kernel=doc2vec_kernel,
                                          gamma='scale')
    # The training matrix is no longer needed; drop the reference early.
    del bow_fit_X

    bow_predictions = bow_classifier.predict(bow_eval_X)
    d2v_predictions = d2v_classifier.predict(d2v_eval_X)
    return permutation_test(gold_test, bow_predictions, d2v_predictions)
Ejemplo n.º 3
0
def run_permutation_concatenated_vs_simple_doc2vec(train, test, kernel1,
                                                   kernel2, model1, model2,
                                                   d2v_X_1, d2v_X_2, d2v_Y):
    """Permutation-test a single-model doc2vec SVM against a concatenated one.

    The first classifier (``kernel1``) is fitted on ``d2v_X_1`` alone; the
    second (``kernel2``) is fitted on ``d2v_X_1`` and ``d2v_X_2`` joined
    column-wise. Test features are built the same way from ``model1`` and
    ``model2``. ``train`` is accepted but not used by this function.

    Returns the result of ``permutation_test`` on the test sentiments and
    the two sets of predictions.
    """
    reviews = test['review'].values
    # Test features: model1 alone, and model1 + model2 side by side.
    single_eval_X = get_doc2vec_data(reviews, model1)
    other_eval_X = get_doc2vec_data(reviews, model2)
    joined_eval_X = np.concatenate((single_eval_X, other_eval_X), axis=1)

    # Fit the simple classifier first, then the concatenated one.
    single_classifier = build_svm_classifier(d2v_X_1,
                                             d2v_Y,
                                             kernel=kernel1,
                                             gamma='scale')
    joined_fit_X = np.concatenate((d2v_X_1, d2v_X_2), axis=1)
    joined_classifier = build_svm_classifier(joined_fit_X,
                                             d2v_Y,
                                             kernel=kernel2,
                                             gamma='scale')

    gold = test['sentiment'].to_numpy()
    single_predictions = single_classifier.predict(single_eval_X)
    joined_predictions = joined_classifier.predict(joined_eval_X)
    return permutation_test(gold, single_predictions, joined_predictions)
Ejemplo n.º 4
0
def find_optimal_doc2vec_hyperparams(imdb_reviews, dev_data):
    """Try doc2vec settings with two SVM kernels and print the best combination.

    For every point of the (window size, epochs, dm, vector size) grid a
    doc2vec model is trained on ``imdb_reviews``; a linear and an rbf SVM
    are then fitted on the resulting training vectors and scored on
    ``dev_data``. The highest accuracy seen and the settings that produced
    it are printed at the end. Nothing is returned.
    """
    separator = '-----------------------------------------'
    # Candidate values per hyper-parameter, flattened into one list of
    # combinations in the same order the nested loops would visit them.
    grid = [(window_size, epochs, dm, vec_size)
            for window_size in [12]
            for epochs in [30]
            for dm in [0]
            for vec_size in [100]]
    # (kernel, extra classifier options, label printed with the accuracy)
    kernel_setups = (
        ('linear', {},
         'accuracy using doc2vec and svm with a linear kernel'),
        ('rbf', {'gamma': 'scale'},
         'accuracy using doc2vec and svm with a gaussian kernel'),
    )

    best_accuracy = -1
    best_params = {}
    for window_size, epochs, dm, vec_size in grid:
        print(separator)
        print('window size', window_size, 'epochs', epochs, 'dm', dm,
              'vec size', vec_size)
        fit_X, fit_Y, model_imdb = train_doc2vec_model(
            imdb_reviews,
            epochs=epochs,
            vec_size=vec_size,
            window_size=window_size,
            dm=dm)
        eval_X = get_doc2vec_data(dev_data['review'].values, model_imdb)
        eval_Y = dev_data['sentiment'].values
        print('For test set vectors inferred with doc2vec')
        for kernel, extra_options, label in kernel_setups:
            classifier = build_svm_classifier(fit_X,
                                              fit_Y,
                                              kernel=kernel,
                                              **extra_options)
            accuracy = estimate_svm_accuracy(eval_X, eval_Y, classifier)
            print(label, accuracy)
            # Strict comparison: on a tie the earlier setting is kept.
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_params = {
                    'kernel': kernel,
                    'dm': dm,
                    'epochs': epochs,
                    'window_size': window_size,
                    'vec_size': vec_size
                }
        print(separator)
    print('Max accuracy was', best_accuracy, 'with params', best_params)
Ejemplo n.º 5
0
def run_permutation_test_two_doc2vec_kernels(train, test, kernel1, kernel2,
                                             model, d2v_X, d2v_Y, **kwargs):
    """Permutation-test two SVM kernels over the same doc2vec features.

    Both classifiers are fitted on ``d2v_X`` / ``d2v_Y`` and evaluated on
    vectors obtained from ``model`` for the test reviews. Any extra keyword
    arguments are forwarded only to the ``kernel1`` classifier. ``train`` is
    accepted but not used by this function.

    Returns the result of ``permutation_test`` on the test sentiments and
    the two sets of predictions.
    """
    # Shared test features.
    eval_X = get_doc2vec_data(test['review'].values, model)

    # Only the first classifier receives the caller's extra options.
    first_classifier = build_svm_classifier(d2v_X,
                                            d2v_Y,
                                            kernel=kernel1,
                                            gamma='scale',
                                            **kwargs)
    second_classifier = build_svm_classifier(d2v_X,
                                             d2v_Y,
                                             kernel=kernel2,
                                             gamma='scale')

    gold = test['sentiment'].to_numpy()
    first_predictions = first_classifier.predict(eval_X)
    second_predictions = second_classifier.predict(eval_X)
    return permutation_test(gold, first_predictions, second_predictions)
def visualise_individual_reviews(indices,
                                 imdb_reviews,
                                 doc2vec_model,
                                 title=''):
    """Print and visualise the reviews found at the given positions.

    For each position in ``indices`` the review text is printed, turned into
    a doc2vec vector and passed, with its sentiment label and its tokens, to
    ``doc2vec_visualisation``.
    """
    all_texts = imdb_reviews['review'].values
    all_sentiments = imdb_reviews['sentiment'].values
    for position in indices:
        text, sentiment = all_texts[position], all_sentiments[position]
        print('Review')
        print(text)
        # A single-element list, since the helper takes a collection of texts.
        vector = get_doc2vec_data([text], doc2vec_model)
        tokens = list(doc_tokenize(text))
        doc2vec_visualisation(vector, [sentiment],
                              doc2vec_model,
                              tokens,
                              title=title)
Ejemplo n.º 7
0
def deployment_test(imdb_data_folder, doc2vec_model, doc2vec_svm,
                    uni_vectorizer, bi_vectorizer, uni_bi_bow_svm):
    """Score the trained classifiers on the 'new' reviews and print each case.

    Args:
        imdb_data_folder: folder handed to ``get_reviews``; the reviews are
            read from its 'new' subfolder for the 'pos' and 'neg' sentiments.
        doc2vec_model: model used to turn the new reviews into vectors.
        doc2vec_svm: classifier evaluated on those doc2vec vectors.
        uni_vectorizer: fitted vectorizer for the unigram BOW features.
        bi_vectorizer: fitted vectorizer for the bigram BOW features.
        uni_bi_bow_svm: classifier evaluated on the unigram and bigram BOW
            features joined column-wise.

    Prints both overall accuracies, then every review with the two
    predictions and its correct label. Returns nothing.
    """
    print('Fetching new reviews')
    # Use the folder supplied by the caller. It used to be overwritten with
    # a hard-coded 'aclImdb' here, which silently ignored the argument.
    imdb_sentiments = ['pos', 'neg']
    # get new imdb reviews
    new_reviews = get_reviews(imdb_data_folder, imdb_sentiments, ['new'])
    new_test_Y = new_reviews['sentiment']
    doc2vec_test_new = get_doc2vec_data(new_reviews['review'].values,
                                        doc2vec_model)
    doc2vec_acc_new = estimate_svm_accuracy(doc2vec_test_new, new_test_Y,
                                            doc2vec_svm)
    print('Doc2Vec acc with new data', doc2vec_acc_new)
    new_test_X, _ = get_bow_vectors(new_reviews['review'].values,
                                    min_count=4,
                                    max_frac=0.5,
                                    frequency=False,
                                    vectorizer=uni_vectorizer)
    new_bi_test_X, _ = get_bow_vectors(new_reviews['review'].values,
                                       min_count=7,
                                       max_frac=0.5,
                                       frequency=False,
                                       vectorizer=bi_vectorizer)
    # Same column layout as the training features of uni_bi_bow_svm is
    # assumed: unigram columns first, then bigram columns.
    new_conc_test_X = np.concatenate((new_test_X, new_bi_test_X), axis=1)
    bow_acc_new = estimate_svm_accuracy(new_conc_test_X, new_test_Y,
                                        uni_bi_bow_svm)
    print('BOW acc with new data', bow_acc_new)
    # Per-review report, walking the rows in positional order.
    rows = zip(new_reviews['review'], new_reviews['sentiment'])
    for i, (review, label) in enumerate(rows):
        print('i:', i)
        print('review')
        print(review)
        # reshape(1, -1) turns the single row into a one-sample 2-D input.
        print('bow prediction',
              uni_bi_bow_svm.predict(new_conc_test_X[i].reshape(1, -1)))
        print('doc2vec prediction',
              doc2vec_svm.predict(doc2vec_test_new[i].reshape(1, -1)))
        print('correct label', label)
        print('-------------')
Ejemplo n.º 8
0
def main():
    """Run the whole experiment: train, cross-validate, evaluate, analyse.

    Steps, in order:
      1. Load the IMDB reviews and the tagged review data, and hold out a
         blind test set from the latter.
      2. Train three doc2vec models (dm=0 with dbow_words, dm=1 with
         dm_concat, plain dm=1) and an rbf SVM on the dm=0 vectors.
      3. Print cross-validated baselines and run the permutation tests on
         the development data.
      4. Report blind-test accuracies for the doc2vec SVM and two BOW SVMs.
      5. Run error analysis, vector-quality checks, visualisations, the
         intensification analysis and the deployment test.
    """
    np.random.seed(123)
    imdb_data_folder = 'aclImdb'
    imdb_sentiments = ['pos', 'neg']
    subfolders = ['train', 'test']
    # get imdb reviews to train doc2vec with
    imdb_reviews = get_reviews(imdb_data_folder, imdb_sentiments, subfolders)
    reviews, _ = get_uni_and_bi_grams('data-tagged')
    review_data = build_data(reviews)
    # set a blind set aside for reporting results
    development_data, blind_test_set = get_train_test_split(0.9, review_data)
    test_Y = blind_test_set['sentiment'].values
    # Hyper-parameter search, disabled by default:
    #find_optimal_doc2vec_hyperparams(imdb_reviews, development_data)
    ####### TRAINING MODELS AND RUNNING EXPERIMENTS #############
    doc2vec_train_X, doc2vec_train_Y, doc2vec_model = train_doc2vec_model(
        imdb_reviews,
        epochs=30,
        window_size=4,
        dm=0,
        dbow_words=1,
        pretrained=True,
        save=True)
    doc2vec_test_X = get_doc2vec_data(blind_test_set['review'].values,
                                      doc2vec_model)
    doc2vec_svm = build_svm_classifier(doc2vec_train_X,
                                       doc2vec_train_Y,
                                       kernel='rbf',
                                       gamma='scale')
    # The label arrays returned for the two dm models are not used below;
    # doc2vec_train_Y is passed on as the labels for all three vector sets.
    (concat_doc2vec_train_X, concat_doc2vec_train_Y,
     concat_doc2vec_model) = train_doc2vec_model(imdb_reviews,
                                                 epochs=30,
                                                 window_size=4,
                                                 dm=1,
                                                 dm_concat=1,
                                                 pretrained=True,
                                                 save=True)
    (dm_doc2vec_train_X, dm_doc2vec_train_Y,
     dm_doc2vec_model) = train_doc2vec_model(imdb_reviews,
                                             epochs=30,
                                             window_size=4,
                                             dm=1,
                                             pretrained=True,
                                             save=True)
    print('-----------')
    # Order matters: downstream code indexes these lists positionally
    # ([0] dm=0 model, [1] dm model, [2] dm-concat model).
    doc2vec_train_Xs = [
        doc2vec_train_X, dm_doc2vec_train_X, concat_doc2vec_train_X
    ]
    doc2vec_models = [doc2vec_model, dm_doc2vec_model, concat_doc2vec_model]
    get_cross_validated_baseline_accuracies(development_data,
                                            doc2vec_Xs=doc2vec_train_Xs,
                                            doc2vec_Y=doc2vec_train_Y,
                                            doc2vec_models=doc2vec_models)
    print('-----------')
    cross_validate_permutation_tests(development_data, imdb_reviews,
                                     doc2vec_train_Xs, doc2vec_train_Y,
                                     doc2vec_models)
    ###################### USING THE BEST MODELS #################
    print('---------Accuracies using the best models---------')
    doc2vec_acc = estimate_svm_accuracy(doc2vec_test_X, test_Y, doc2vec_svm)
    print('doc2vec accuracy', doc2vec_acc)
    X_pres, uni_vectorizer = get_bow_vectors(development_data['review'].values,
                                             min_count=4,
                                             max_frac=0.5,
                                             frequency=False)
    X_bi, bi_vectorizer = get_bow_vectors(development_data['review'].values,
                                          min_count=7,
                                          max_frac=0.5,
                                          frequency=False,
                                          bigrams=True)
    # The blind test set is vectorised with the vectorizers returned for
    # the development data.
    test_X, _ = get_bow_vectors(blind_test_set['review'].values,
                                min_count=4,
                                max_frac=0.5,
                                frequency=False,
                                vectorizer=uni_vectorizer)
    bi_test_X, _ = get_bow_vectors(blind_test_set['review'].values,
                                   min_count=7,
                                   max_frac=0.5,
                                   frequency=False,
                                   vectorizer=bi_vectorizer)
    conc_test_X = np.concatenate((test_X, bi_test_X), axis=1)
    uni_bi_bow_svm = build_svm_classifier(np.concatenate((X_pres, X_bi),
                                                         axis=1),
                                          development_data['sentiment'].values,
                                          kernel='linear',
                                          probability=True)
    bow_acc = estimate_svm_accuracy(conc_test_X, test_Y, uni_bi_bow_svm)
    print(
        'bow accuracy with presence and both uni and bigrams and a linear kernel',
        bow_acc)
    bow_svm = build_svm_classifier(X_pres,
                                   development_data['sentiment'].values,
                                   kernel='linear')
    bow_acc2 = estimate_svm_accuracy(test_X, test_Y, bow_svm)
    print('bow accuracy with presence, unigrams and a linear kernel', bow_acc2)
    print('-------------')
    print('Error analysis for Doc2Vec')
    model_error_analysis(doc2vec_test_X, blind_test_set, doc2vec_svm)
    evaluate_vector_qualities(doc2vec_test_X, test_Y)
    print('Error analysis for BOW with both unigrams and bigrams')
    model_error_analysis(conc_test_X, blind_test_set, uni_bi_bow_svm)
    evaluate_vector_qualities(conc_test_X, test_Y)
    print('----------------')
    print('Vector quality estimation')
    # This call used to pass `model_imdb`, a name that is never bound in
    # this function; the doc2vec model trained on the IMDB reviews that is
    # in scope here is `doc2vec_model`.
    # NOTE(review): test_X holds the unigram BOW test vectors -- confirm
    # that these, rather than doc2vec_test_X, are the intended input.
    evaluate_vector_qualities(test_X, test_Y, doc2vec_model)
    do_test_visualisations(blind_test_set, conc_test_X, test_Y, doc2vec_test_X,
                           doc2vec_model)
    print('----------------')
    print('Plotting emotion sentences for intensification analysis')
    run_intensification_analysis(doc2vec_model)
    print('Running a deployment test')
    deployment_test(imdb_data_folder, doc2vec_model, doc2vec_svm,
                    uni_vectorizer, bi_vectorizer, uni_bi_bow_svm)
Ejemplo n.º 9
0
def analyse_emotion_statements(emotion_statement, statement, doc2vec_model):
    """Plot the doc2vec vectors of two statements against each other.

    Both texts are converted with ``get_doc2vec_data`` (emotion statement
    first) and handed, together with the texts themselves, to
    ``heat_plot_two_vectors``.
    """
    vectors = [
        get_doc2vec_data([text], doc2vec_model)
        for text in (emotion_statement, statement)
    ]
    heat_plot_two_vectors(vectors[0], emotion_statement, vectors[1],
                          statement)
Ejemplo n.º 10
0
def get_cross_validated_baseline_accuracies(development_data, doc2vec_Xs,
                                            doc2vec_Y, doc2vec_models):
    """Print baseline accuracies for the BOW and Doc2Vec SVM variants.

    The BOW feature sets are built from ``development_data`` and scored with
    ``cross_validate_svm``. The Doc2Vec classifiers are fitted on the
    supplied training vectors and scored on vectors obtained from the
    matching model for the development reviews.

    Args:
        development_data: table with 'review' and 'sentiment' columns.
        doc2vec_Xs: training vectors, indexed as used by the printed labels:
            [0] dbow, [1] dm, [2] dm concat.
        doc2vec_Y: training labels used with every entry of ``doc2vec_Xs``.
        doc2vec_models: doc2vec models in the same order as ``doc2vec_Xs``.

    Returns:
        None. All results are printed.
    """
    # Positions of the three doc2vec variants in doc2vec_Xs / doc2vec_models.
    dbow, dm, dm_concat = 0, 1, 2
    reviews = development_data['review'].values
    Y = development_data['sentiment'].to_numpy()

    ############# BOW accuracies ##############
    X, _ = get_bow_vectors(reviews, min_count=4, max_frac=0.5)
    X_pres, _ = get_bow_vectors(reviews,
                                min_count=4,
                                max_frac=0.5,
                                frequency=False)
    X_low, _ = get_bow_vectors(reviews,
                               min_count=4,
                               max_frac=0.5,
                               lowercase=False,
                               frequency=False)
    X_bi, _ = get_bow_vectors(reviews,
                              min_count=7,
                              max_frac=0.5,
                              frequency=False,
                              bigrams=True)
    acc6 = cross_validate_svm(X_bi, Y, kernel='linear', gamma='scale')
    print(
        'Cross validated bow accuracy when using a linear kernel, feature presence and bigrams',
        acc6)
    acc7 = cross_validate_svm(np.concatenate((X_bi, X_pres), axis=1),
                              Y,
                              kernel='linear',
                              gamma='scale')
    print(
        'Cross validated bow accuracy when using a linear kernel, feature presence and both unigrams and bigrams',
        acc7)
    acc5 = cross_validate_svm(X_low, Y, kernel='linear', gamma='scale')
    print(
        'Cross validated bow accuracy when using a linear kernel, feature presence and lowercased input',
        acc5)
    acc4 = cross_validate_svm(X_pres, Y, kernel='rbf', gamma='scale')
    print(
        'Cross validated bow accuracy when using a gaussian kernel and feature presence',
        acc4)
    acc3 = cross_validate_svm(X_pres, Y, kernel='linear', gamma='scale')
    print(
        'Cross validated bow accuracy when using a linear kernel and feature presence',
        acc3)
    acc1 = cross_validate_svm(X, Y, kernel='rbf', gamma='scale')
    print('Cross validated bow accuracy when using a gaussian kernel', acc1)
    acc2 = cross_validate_svm(X, Y, kernel='linear')
    print('Cross validated bow accuracy when using a linear kernel', acc2)

    ############# Doc2Vec accuracies #############
    # Each experiment fits its classifier and obtains its test vectors in
    # the same order as before, so the sequence of helper calls is unchanged.

    # --- dm concat model ---
    svm = build_svm_classifier(doc2vec_Xs[dm_concat],
                               doc2vec_Y,
                               kernel='rbf',
                               gamma='scale')
    dm_concat_test_X = get_doc2vec_data(reviews, doc2vec_models[dm_concat])
    accuracy = estimate_svm_accuracy(dm_concat_test_X, Y, svm)
    print('Doc2Vec accuracy with a gaussian kernel and dm concat vectors',
          accuracy)
    svm = build_svm_classifier(doc2vec_Xs[dm_concat],
                               doc2vec_Y,
                               kernel='linear',
                               gamma='scale')
    dm_concat_test_X = get_doc2vec_data(reviews, doc2vec_models[dm_concat])
    accuracy = estimate_svm_accuracy(dm_concat_test_X, Y, svm)
    print('Doc2Vec accuracy with a linear kernel and dm concat vectors',
          accuracy)

    # --- dbow model ---
    svm = build_svm_classifier(doc2vec_Xs[dbow],
                               doc2vec_Y,
                               kernel='rbf',
                               gamma='scale')
    dbow_test_X = get_doc2vec_data(reviews, doc2vec_models[dbow])
    accuracy = estimate_svm_accuracy(dbow_test_X, Y, svm)
    print('Doc2Vec accuracy with a gaussian kernel and dbow', accuracy)
    svm = build_svm_classifier(doc2vec_Xs[dbow],
                               doc2vec_Y,
                               kernel='linear',
                               gamma='scale')
    # Kept under its own name: both concatenated experiments below need the
    # dbow test vectors as the first half of their test matrix.
    dbow_test_X = get_doc2vec_data(reviews, doc2vec_models[dbow])
    accuracy = estimate_svm_accuracy(dbow_test_X, Y, svm)
    print('Doc2Vec accuracy with a linear kernel and dbow', accuracy)

    # --- dbow + dm concatenated, gaussian kernel ---
    dm_test_X = get_doc2vec_data(reviews, doc2vec_models[dm])
    concat_test_X = np.concatenate((dbow_test_X, dm_test_X), axis=1)
    concat_train_X = np.concatenate((doc2vec_Xs[dbow], doc2vec_Xs[dm]),
                                    axis=1)
    svm = build_svm_classifier(concat_train_X,
                               doc2vec_Y,
                               kernel='rbf',
                               gamma='scale')
    accuracy = estimate_svm_accuracy(concat_test_X, Y, svm)
    print('Doc2Vec accuracy with concatenated vectors and gaussian kernel',
          accuracy)

    # --- dm model ---
    svm = build_svm_classifier(doc2vec_Xs[dm],
                               doc2vec_Y,
                               kernel='linear',
                               gamma='scale')
    dm_test_X = get_doc2vec_data(reviews, doc2vec_models[dm])
    accuracy = estimate_svm_accuracy(dm_test_X, Y, svm)
    print('Doc2Vec accuracy with a linear kernel and dm', accuracy)
    svm = build_svm_classifier(doc2vec_Xs[dm],
                               doc2vec_Y,
                               kernel='rbf',
                               gamma='scale')
    dm_test_X = get_doc2vec_data(reviews, doc2vec_models[dm])
    accuracy = estimate_svm_accuracy(dm_test_X, Y, svm)
    print('Doc2Vec accuracy with a gaussian kernel and dm', accuracy)

    # --- dbow + dm concatenated, linear kernel ---
    # The first half of the test matrix must be the dbow vectors, matching
    # the dbow + dm training matrix. Previously a reused variable held dm
    # vectors at this point, so the classifier was scored on dm + dm.
    dm_test_X = get_doc2vec_data(reviews, doc2vec_models[dm])
    concat_test_X = np.concatenate((dbow_test_X, dm_test_X), axis=1)
    svm = build_svm_classifier(concat_train_X,
                               doc2vec_Y,
                               kernel='linear',
                               gamma='scale')
    accuracy = estimate_svm_accuracy(concat_test_X, Y, svm)
    print('Doc2Vec accuracy with concatenated vectors and linear kernel',
          accuracy)