Example #1
0
def run_classifiers_with_doc2vec(reviews,
                                 scores,
                                 review_lengths,
                                 with_features=False):
    """Train and evaluate four classifiers on doc2vec features of reviews.

    A corpus is built from ``reviews`` and ``scores`` with
    ``embeddings.get_corpus`` and cut down to its first 20000 documents. The
    corpus is split 75/25 with a fixed ``random_state`` of 42, a doc2vec model
    is created from the training part, and feature/label lists are derived
    for both parts. Logistic regression, k-nearest neighbors, decision trees
    and random forest classifiers are then built and evaluated on the
    training and the testing data.

    Args:
        reviews: Reviews used to build the corpus and, when
            ``with_features`` is set, the part-of-speech features.
        scores: Scores paired with ``reviews`` when the corpus is built;
            each document's first tag is used as its label.
        review_lengths: Review lengths (presumably one per review); passed
            to ``get_train_lists`` and to ``add_length_review_feature``.
        with_features: When True, part-of-speech and review-length features
            are added to the train and test feature matrices.

    Returns:
        None. A header is printed per classifier and evaluation is delegated
        to the ``functions.train_classifier_and_evaluate_accuracy_*``
        helpers.
    """
    corpus = list(embeddings.get_corpus(reviews, scores))[:20000]
    train_corpus, test_corpus = train_test_split(corpus,
                                                 test_size=0.25,
                                                 random_state=42)

    doc2vec_model = embeddings.create_doc2vec_model(train_corpus)

    # Separate each tagged document into its words and its first tag.
    train_words, train_tags = zip(*[(doc.words, doc.tags[0])
                                    for doc in train_corpus])
    test_words, test_tags = zip(*[(doc.words, doc.tags[0])
                                  for doc in test_corpus])

    # Per the original author's notes, the helpers below infer one doc2vec
    # feature vector per review (doc2vec_model.infer_vector).
    train_x, train_y = get_train_lists(doc2vec_model, train_words,
                                       train_tags, review_lengths)
    test_x, test_y = get_test_lists(doc2vec_model, test_words, test_tags)

    if with_features:
        # NOTE(review): prp_list and review_lengths are built from ALL
        # reviews, while train_x/test_x only hold the shuffled train/test
        # subsets of the first 20000 documents. Confirm that the helpers
        # below align rows correctly before relying on these features.
        prp_list = functions.create_pos_features(reviews)
        train_x = functions.add_pos_feature(train_x, prp_list)
        train_x = add_length_review_feature(train_x, review_lengths)
        test_x = functions.add_pos_feature(test_x, prp_list)
        test_x = add_length_review_feature(test_x, review_lengths)

    # Build every classifier up front, in the same order as they are
    # reported, each paired with the title printed above its results.
    named_classifiers = [
        ("Logistic Regression",
         classification.logistic_regression(train_x, train_y)),
        ("K Nearest Neighbors",
         classification.knearest_neighbors(train_x, train_y)),
        ("Decision Trees",
         classification.decision_trees(train_x, train_y)),
        ("Random Forest",
         classification.random_forest(train_x, train_y)),
    ]

    for title, classifier in named_classifiers:
        print("-------------------------------------------------")
        print(title + "\n")
        # Train and predict for both the training and the testing data.
        functions.train_classifier_and_evaluate_accuracy_on_training_data(
            classifier, train_x, train_y)
        functions.train_classifier_and_evaluate_accuracy_on_testing_data(
            classifier, test_x, test_y)
        print('\n\n')
Example #2
0
def run_classifiers_with_bow(reviews,
                             scores,
                             review_lengths,
                             with_features=False):
    """Train and evaluate four classifiers on bag-of-words review features.

    A bag-of-words matrix is created from ``reviews``, optionally extended
    with part-of-speech and review-length features, and split 75/25 with a
    fixed ``random_state`` of 42. Logistic regression, k-nearest neighbors,
    decision trees and random forest classifiers are then built and
    evaluated on the training and the testing data.

    Args:
        reviews: Reviews used to build the bag-of-words matrix and, when
            ``with_features`` is set, the part-of-speech features.
        scores: Labels split alongside the feature matrix.
        review_lengths: Review lengths (presumably one per review); passed
            to ``add_length_review_feature``.
        with_features: When True, part-of-speech and review-length features
            are added to the feature matrix.

    Returns:
        None. A header is printed per classifier and evaluation is delegated
        to the ``functions.train_classifier_and_evaluate_accuracy_*``
        helpers.
    """
    # The fitted vectorizer returned alongside the matrix is not needed here.
    X, _ = functions.create_bow_from_reviews(reviews)

    if with_features:
        # The extra features are computed from ALL reviews in their original
        # order, so they are attached to the full matrix BEFORE the shuffled
        # split. Attaching them to train_x/test_x afterwards would pair
        # full-length feature lists with 75% / 25% row subsets.
        prp_list = functions.create_pos_features(reviews)
        X = functions.add_pos_feature(X, prp_list)
        X = add_length_review_feature(X, review_lengths)

    # The split depends only on the number of rows and the seed, so the same
    # reviews land in train/test whether or not extra columns were added.
    train_x, test_x, train_y, test_y = train_test_split(X,
                                                        scores,
                                                        test_size=0.25,
                                                        random_state=42)

    # Build every classifier from the training features and labels, each
    # paired with the title printed above its results.
    named_classifiers = [
        ("Logistic Regression",
         classification.logistic_regression(train_x, train_y)),
        ("K Nearest Neighbors",
         classification.knearest_neighbors(train_x, train_y)),
        ("Decision Trees",
         classification.decision_trees(train_x, train_y)),
        ("Random Forest",
         classification.random_forest(train_x, train_y)),
    ]

    for title, classifier in named_classifiers:
        print("-------------------------------------------------")
        print(title + "\n")
        # Train and predict for both the training and the testing data.
        functions.train_classifier_and_evaluate_accuracy_on_training_data(
            classifier, train_x, train_y)
        functions.train_classifier_and_evaluate_accuracy_on_testing_data(
            classifier, test_x, test_y)
        print('\n\n')
Example #3
0
    # Adding length of a each review feature
    print("After adding length review feature")
    X = functions.add_length_review_feature(X, length_of_reviews)
    print(X)

    # Adding Part of Speech Tag Feature
    print("After adding Part of Speech Tag feature")
    prp_list = functions.create_pos_features(reviews)
    X = functions.add_pos_feature(X, prp_list)
    print(X)

    # Logistic Regression
    # --------------------------------------------
    classifier = classification.logistic_regression(X, scores)

    # Naive Bayes
    # --------------------------------------------
    classifier = classification.naive_bayes(X, scores)

    # K Nearest Neighbors
    # --------------------------------------------
    classifier = classification.knearest_neighbors(X, scores)

    # Decision Trees
    # --------------------------------------------
    classifier = classification.decision_trees(X, scores)

    # Random Forests
    # --------------------------------------------
    classifier = classification.random_forest(X, scores)
if __name__ == '__main__':
    # Banner: which dataset and feature configuration this run uses.
    print('Running Classifiers for dianping dataset',
          "Does not include extra features",
          "Using Bag of Words",
          '------------------------------------------',
          sep='\n')

    # Load stopwords and the labelled reviews, then build the bag-of-words
    # matrix from the reviews with those stopwords.
    stop = dianping.gather_stopwords()
    labels, reviews = dianping.read_chinese()
    BOW, vec = dianping.chinese_BOW(reviews, stop)

    # Every classifier below is built on the same matrix and labels. The
    # name `classifier` is rebound each time, so only the last one remains.

    # -- Logistic Regression --
    classifier = classification.logistic_regression(BOW, labels)

    # -- Naive Bayes --
    classifier = classification.naive_bayes(BOW, labels)

    # -- K Nearest Neighbors --
    classifier = classification.knearest_neighbors(BOW, labels)

    # -- Decision Trees --
    classifier = classification.decision_trees(BOW, labels)

    # -- Random Forests --
    classifier = classification.random_forest(BOW, labels)