# --- Example 1 ---
def main(training_file, test_file, submission_file, ratio):
    """Train a random forest and write test-set probability predictions.

    Reads the training and test files, builds and selects features, trains
    on a down-sampled split, and writes predicted probabilities to the
    submission file.

    Args:
        training_file: path to the labelled training data.
        test_file: path to the unlabelled test data.
        submission_file: path the submission rows are written to.
        ratio: down-sampling ratio forwarded to preprocess.down_sample.
    """
    data = utilities.read_file(training_file)
    test_data = utilities.read_file(test_file)

    # Parenthesized print calls are valid under both Python 2 and 3;
    # the original bare `print` statements were Python-2-only.
    print('Preparing data...')
    x, y = preprocess.prepare_data(data)
    refid, x_test = preprocess.prepare_test_data(test_data)
    x, x_test = preprocess.preprocess_features(x, x_test)

    print('Feature extracting...')
    x, x_test = feature_extraction.create_feature(x, y, x_test)

    # Keep the 300 best-scoring features; apply the same indices to the
    # test matrix so column order stays consistent.
    indices = feature_extraction.get_best_k_feature_indices(x, y, 300)
    x = feature_extraction.get_best_k_features(x, indices)
    x_test = feature_extraction.get_best_k_features(x_test, indices)
    print('Get %s features.' % len(x[0]))

    # Hold out 30% for cross-validation; down-sample only the training part.
    x_train, x_cv, y_train, y_cv = cross_validation.train_test_split(
        x, y, test_size=.3, random_state=0)
    x_train, y_train = preprocess.down_sample(x_train, y_train, ratio)

    clf = classification.random_forest(x_train, y_train, x_cv, y_cv)

    print('Predicting...')
    predict = clf.predict_proba(x_test)
    utilities.write_submission_file(submission_file, refid, predict)
# --- Example 2 ---
def main(training_file, test_file, submission_file, ratio):
    """Train a random forest and write test-set probability predictions.

    Reads the training and test files, builds and selects features, trains
    on a down-sampled split, and writes predicted probabilities to the
    submission file.

    Args:
        training_file: path to the labelled training data.
        test_file: path to the unlabelled test data.
        submission_file: path the submission rows are written to.
        ratio: down-sampling ratio forwarded to preprocess.down_sample.
    """
    data = utilities.read_file(training_file)
    test_data = utilities.read_file(test_file)

    # Parenthesized print calls are valid under both Python 2 and 3;
    # the original bare `print` statements were Python-2-only.
    print('Preparing data...')
    x, y = preprocess.prepare_data(data)
    refid, x_test = preprocess.prepare_test_data(test_data)
    x, x_test = preprocess.preprocess_features(x, x_test)

    print('Feature extracting...')
    x, x_test = feature_extraction.create_feature(x, y, x_test)

    # Keep the 300 best-scoring features; apply the same indices to the
    # test matrix so column order stays consistent.
    indices = feature_extraction.get_best_k_feature_indices(x, y, 300)
    x = feature_extraction.get_best_k_features(x, indices)
    x_test = feature_extraction.get_best_k_features(x_test, indices)
    print('Get %s features.' % len(x[0]))

    # Hold out 30% for cross-validation; down-sample only the training part.
    x_train, x_cv, y_train, y_cv = cross_validation.train_test_split(
        x, y, test_size=.3, random_state=0)
    x_train, y_train = preprocess.down_sample(x_train, y_train, ratio)

    clf = classification.random_forest(x_train, y_train, x_cv, y_cv)

    print('Predicting...')
    predict = clf.predict_proba(x_test)
    utilities.write_submission_file(submission_file, refid, predict)
# --- Example 3 ---
def run_classifiers_with_doc2vec(reviews,
                                 scores,
                                 review_lengths,
                                 with_features=False):
    """Train and evaluate four classifiers on doc2vec review embeddings.

    Builds a doc2vec model from a 20k-document sample of the corpus,
    infers a feature vector per review, optionally appends extra feature
    columns, then prints an accuracy report for each classifier.
    """
    # Corpus is an iterable of TaggedDocument objects; cap it at 20k docs.
    corpus = list(embeddings.get_corpus(reviews, scores))[:20000]
    train_corpus, test_corpus = train_test_split(corpus,
                                                 test_size=0.25,
                                                 random_state=42)

    doc2vec_model = embeddings.create_doc2vec_model(train_corpus)
    # Split every TaggedDocument into its word list and its first tag.
    train_targets, train_regressors = zip(*((doc.words, doc.tags[0])
                                            for doc in train_corpus))
    test_targets, test_regressors = zip(*((doc.words, doc.tags[0])
                                          for doc in test_corpus))

    # These helpers run doc2vec_model.infer_vector(review) per document,
    # producing a fixed-size feature vector for every review.
    train_x, train_y = get_train_lists(doc2vec_model, train_targets,
                                       train_regressors, review_lengths)
    test_x, test_y = get_test_lists(doc2vec_model, test_targets,
                                    test_regressors)

    if with_features:
        # Append POS and review-length columns to both feature matrices.
        # NOTE(review): prp_list and review_lengths cover *all* reviews
        # while train_x/test_x are split subsets — confirm the row counts
        # actually line up inside these helpers.
        prp_list = functions.create_pos_features(reviews)
        train_x = functions.add_pos_feature(train_x, prp_list)
        train_x = add_length_review_feature(train_x, review_lengths)
        test_x = functions.add_pos_feature(test_x, prp_list)
        test_x = add_length_review_feature(test_x, review_lengths)

    # Fit each model up front, paired with its display name.
    named_models = [
        ("Logistic Regression",
         classification.logistic_regression(train_x, train_y)),
        ("K Nearest Neighbors",
         classification.knearest_neighbors(train_x, train_y)),
        ("Decision Trees",
         classification.decision_trees(train_x, train_y)),
        ("Random Forest",
         classification.random_forest(train_x, train_y)),
    ]

    for name, model in named_models:
        print("-------------------------------------------------")
        print(name + "\n")
        # Report accuracy on the training split and the held-out split.
        functions.train_classifier_and_evaluate_accuracy_on_training_data(
            model, train_x, train_y)
        functions.train_classifier_and_evaluate_accuracy_on_testing_data(
            model, test_x, test_y)
        print('\n\n')
# --- Example 4 ---
def spring_brother(training_file, test_file, submission_file):
    """ Running on the test file. """

    labels, train_meta = utilities.read_training_file(training_file)
    test_ids, test_meta = utilities.read_test_file(test_file)

    # Build train/test feature matrices from the raw metadata in one pass.
    train_features, test_features = feature_selection.generate_features(
        train_meta, labels, test_meta)

    # No cross-validation split here, hence the two None arguments.
    model = classification.random_forest(train_features, labels, None, None)

    probabilities = classification.get_prob(model, test_features)
    utilities.write_submission_file(submission_file, test_ids, probabilities)
# --- Example 5 ---
def spring_brother(training_file, test_file, submission_file):
    """ Running on the test file. """

    targets, meta_train = utilities.read_training_file(training_file)
    ref_ids, meta_test = utilities.read_test_file(test_file)

    # One call produces matching feature matrices for train and test.
    features_train, features_test = feature_selection.generate_features(
        meta_train, targets, meta_test)

    # The trailing None, None skip the optional cross-validation inputs.
    forest = classification.random_forest(features_train, targets, None, None)

    predictions = classification.get_prob(forest, features_test)
    utilities.write_submission_file(submission_file, ref_ids, predictions)
# --- Example 6 ---
def all_feature_classify(training_file, num):
    """ Classifier using all features.

    Trains a random forest on a sample of the training data and prints the
    binomial deviance on both the training and cross-validation splits.

    Args:
        training_file: path to the labelled training data.
        num: number of examples to keep via utilities.sample.
    """
    y, meta_data = utilities.read_training_file(training_file)
    y, meta_data = utilities.sample(y, meta_data, num)

    meta_data_train, y_train, meta_data_cv, y_cv = \
        classification.prepare_data(meta_data, y)

    x_train, x_cv = feature_selection.generate_features(
        meta_data_train, y_train, meta_data_cv)

    clf = classification.random_forest(x_train, y_train, x_cv, y_cv)
    # Parenthesized print calls are valid under both Python 2 and 3;
    # the original bare `print` statements were Python-2-only.
    print(utilities.binomial_deviance(
        y_train, classification.get_prob(clf, x_train)))
    print(utilities.binomial_deviance(y_cv, classification.get_prob(clf, x_cv)))
# --- Example 7 ---
def all_feature_classify(training_file, num):
    """ Classifier using all features.

    Trains a random forest on a sample of the training data and prints the
    binomial deviance on both the training and cross-validation splits.

    Args:
        training_file: path to the labelled training data.
        num: number of examples to keep via utilities.sample.
    """
    y, meta_data = utilities.read_training_file(training_file)
    y, meta_data = utilities.sample(y, meta_data, num)

    meta_data_train, y_train, meta_data_cv, y_cv = \
        classification.prepare_data(meta_data, y)

    x_train, x_cv = feature_selection.generate_features(
        meta_data_train, y_train, meta_data_cv)

    clf = classification.random_forest(x_train, y_train, x_cv, y_cv)
    # Parenthesized print calls are valid under both Python 2 and 3;
    # the original bare `print` statements were Python-2-only.
    print(utilities.binomial_deviance(
        y_train, classification.get_prob(clf, x_train)))
    print(utilities.binomial_deviance(y_cv, classification.get_prob(clf, x_cv)))
# --- Example 8 ---
def run_classifiers_with_bow(reviews,
                             scores,
                             review_lengths,
                             with_features=False):
    """Train and evaluate four classifiers on bag-of-words review features.

    Vectorizes the reviews, splits them 75/25, optionally appends extra
    feature columns, then prints an accuracy report per classifier.
    """
    X, vectorizer = functions.create_bow_from_reviews(reviews)
    train_x, test_x, train_y, test_y = train_test_split(X,
                                                        scores,
                                                        test_size=0.25,
                                                        random_state=42)

    if with_features:
        # Build a part-of-speech feature vector per review, then append
        # the POS and review-length columns to both feature matrices.
        # NOTE(review): prp_list and review_lengths cover *all* reviews
        # while train_x/test_x are split subsets — confirm the row counts
        # actually line up inside these helpers.
        prp_list = functions.create_pos_features(reviews)
        train_x = functions.add_pos_feature(train_x, prp_list)
        train_x = add_length_review_feature(train_x, review_lengths)
        test_x = functions.add_pos_feature(test_x, prp_list)
        test_x = add_length_review_feature(test_x, review_lengths)

    # Fit each classifier on the training features, paired with its name.
    named_models = [
        ("Logistic Regression",
         classification.logistic_regression(train_x, train_y)),
        ("K Nearest Neighbors",
         classification.knearest_neighbors(train_x, train_y)),
        ("Decision Trees",
         classification.decision_trees(train_x, train_y)),
        ("Random Forest",
         classification.random_forest(train_x, train_y)),
    ]

    for name, model in named_models:
        print("-------------------------------------------------")
        print(name + "\n")
        # Report accuracy on the training split and the held-out split.
        functions.train_classifier_and_evaluate_accuracy_on_training_data(
            model, train_x, train_y)
        functions.train_classifier_and_evaluate_accuracy_on_testing_data(
            model, test_x, test_y)
        print('\n\n')
# --- Example 9 ---
    # NOTE(review): this fragment begins mid-function — its enclosing `def`
    # was lost when the example was extracted. Names used below (X, reviews,
    # scores, length_of_reviews) belong to that missing scope; confirm
    # against the source project before relying on them.
    # Adding length of a each review feature
    print("After adding length review feature")
    X = functions.add_length_review_feature(X, length_of_reviews)
    print(X)

    # Adding Part of Speech Tag Feature
    print("After adding Part of Speech Tag feature")
    prp_list = functions.create_pos_features(reviews)
    X = functions.add_pos_feature(X, prp_list)
    print(X)

    # Logistic Regression
    # --------------------------------------------
    classifier = classification.logistic_regression(X, scores)

    # Naive Bayes
    # --------------------------------------------
    classifier = classification.naive_bayes(X, scores)

    # K Nearest Neighbors
    # --------------------------------------------
    classifier = classification.knearest_neighbors(X, scores)

    # Decision Trees
    # --------------------------------------------
    # Each call rebinds `classifier`; earlier fitted models are discarded.
    classifier = classification.decision_trees(X, scores)

    # Random Forests
    # --------------------------------------------
    # Only this last fitted model remains bound to `classifier`.
    classifier = classification.random_forest(X, scores)
if __name__ == '__main__':
    # Entry point: fit a series of classifiers on the dianping bag-of-words.
    print('Running Classifiers for dianping dataset')
    print("Does not include extra features")
    print("Using Bag of Words")
    print('------------------------------------------')

    stop = dianping.gather_stopwords()
    labels, reviews = dianping.read_chinese()

    BOW, vec = dianping.chinese_BOW(reviews, stop)

    # Fit each classifier in turn on the same features/labels. As in the
    # original flow, `classifier` is rebound each time, so only the last
    # fitted model (random forest) survives the loop.
    for fit in (classification.logistic_regression,
                classification.naive_bayes,
                classification.knearest_neighbors,
                classification.decision_trees,
                classification.random_forest):
        classifier = fit(BOW, labels)
# --- Example 11 ---
# Compare four classifiers (naive Bayes, KNN, SVM, random forest) on small
# slices of the project's document lists, printing a classification report
# and an accuracy score for each.
from classification import Naive_bayesian, KNN, random_forest, SVM
from sklearn.metrics import accuracy_score, classification_report
from classification import test_document_list, train_document_list
# NOTE(review): this import shadows the Naive_bayesian imported from
# `classification` above — confirm the search-module variant is intended.
from search import Naive_bayesian

# Ground-truth class label is stored at index 0 of each test document.
classes_test = [test_document[0] for test_document in test_document_list]
# Slices start at 1 throughout — presumably skipping a header/zeroth
# entry; verify against how the document lists are built.
classes_pred_1 = Naive_bayesian(train_document_list[1:500],
                                test_document_list[1:20])
# print(classes_test, classes_pred_1)
print(classification_report(classes_test[1:20], classes_pred_1))
print(accuracy_score(classes_test[1:20], classes_pred_1))

# KNN trains on a smaller slice (200 docs) and uses k=5 neighbours.
classes_pred_2 = KNN(train_document_list[1:200], test_document_list[1:20], 5)
# print(classes_test[1:20], classes_pred_2)
# Only the KNN report converts labels to int — presumably KNN returns
# integer predictions while the others echo the raw labels; confirm.
print(
    classification_report([int(c) for c in classes_test[1:20]],
                          classes_pred_2))
print(accuracy_score(classes_test[1:20], classes_pred_2))

classes_pred_3 = SVM(train_document_list[1:500], test_document_list[1:20])
print(classification_report(classes_test[1:20], classes_pred_3))
print(accuracy_score(classes_test[1:20], classes_pred_3))

classes_pred_4 = random_forest(train_document_list[1:500],
                               test_document_list[1:20])
print(classification_report(classes_test[1:20], classes_pred_4))
print(accuracy_score(classes_test[1:20], classes_pred_4))
# --- Example 12 ---
def random_forest(input_dict):
    """Random Forest learner.

    Pulls the parameter dict out of the widget input and wraps the
    constructed learner in the output dict expected by the caller.
    """
    params = input_dict['params']
    learner = classification.random_forest(params)
    return {'learner': learner}