def main(training_file, test_file, submission_file, ratio):
    """Run the whole pipeline and write test-set predictions.

    Both inputs are loaded with ``utilities.read_file``, turned into feature
    matrices by ``preprocess`` and ``feature_extraction``, and a classifier
    built by ``classification.random_forest`` produces the probabilities
    that are handed to ``utilities.write_submission_file``.

    Args:
        training_file: Training input, read with ``utilities.read_file``.
        test_file: Test input, read with ``utilities.read_file``.
        submission_file: Destination passed to
            ``utilities.write_submission_file``.
        ratio: Forwarded unchanged to ``preprocess.down_sample``.
    """
    data = utilities.read_file(training_file)
    test_data = utilities.read_file(test_file)

    # Single-argument, parenthesised print produces the same output under
    # the Python 2 print statement and the Python 3 print function.
    print('Preparing data...')
    x, y = preprocess.prepare_data(data)
    refid, x_test = preprocess.prepare_test_data(test_data)
    x, x_test = preprocess.preprocess_features(x, x_test)

    print('Feature extracting...')
    x, x_test = feature_extraction.create_feature(x, y, x_test)
    # The indices are chosen from the training data only and then applied
    # to both matrices so their columns stay aligned.
    indices = feature_extraction.get_best_k_feature_indices(x, y, 300)
    x = feature_extraction.get_best_k_features(x, indices)
    x_test = feature_extraction.get_best_k_features(x_test, indices)
    print('Get %s features.' % len(x[0]))

    x_train, x_cv, y_train, y_cv = cross_validation.train_test_split(
        x, y, test_size=.3, random_state=0)
    # Down-sampling is applied after the split, so only the training part
    # is affected; x_cv / y_cv are passed on untouched.
    x_train, y_train = preprocess.down_sample(x_train, y_train, ratio)
    clf = classification.random_forest(x_train, y_train, x_cv, y_cv)

    print('Predicting...')
    predict = clf.predict_proba(x_test)
    utilities.write_submission_file(submission_file, refid, predict)
def run_classifiers_with_doc2vec(reviews, scores, review_lengths, with_features=False):
    """Evaluate four classifiers on doc2vec-derived review features.

    The corpus returned by ``embeddings.get_corpus`` is expected to be made
    of TaggedDocument objects; only its first 20000 entries are used.

    Args:
        reviews: Reviews passed to ``embeddings.get_corpus`` and, when
            ``with_features`` is true, to ``functions.create_pos_features``.
        scores: Scores passed to ``embeddings.get_corpus``.
        review_lengths: Passed to ``get_train_lists`` and, when
            ``with_features`` is true, to ``add_length_review_feature``.
        with_features: When true, the part-of-speech and review-length
            features are added to both feature matrices.
    """
    corpus = list(embeddings.get_corpus(reviews, scores))[:20000]
    train_corpus, test_corpus = train_test_split(
        corpus, test_size=0.25, random_state=42)
    doc2vec_model = embeddings.create_doc2vec_model(train_corpus)

    # Split every document into its words and its first tag.
    train_targets, train_regressors = zip(
        *((doc.words, doc.tags[0]) for doc in train_corpus))
    test_targets, test_regressors = zip(
        *((doc.words, doc.tags[0]) for doc in test_corpus))

    # For every review, doc2vec_model.infer_vector(review) is applied,
    # giving one feature vector per document (review) in the corpus.
    train_x, train_y = get_train_lists(
        doc2vec_model, train_targets, train_regressors, review_lengths)
    test_x, test_y = get_test_lists(
        doc2vec_model, test_targets, test_regressors)

    if with_features:
        # Extend the existing feature matrices with the extra features.
        # NOTE(review): prp_list and review_lengths are built from all
        # reviews, while train_x / test_x come from a split of the first
        # 20000 documents -- confirm the helpers keep rows aligned.
        prp_list = functions.create_pos_features(reviews)
        train_x = functions.add_pos_feature(train_x, prp_list)
        train_x = add_length_review_feature(train_x, review_lengths)
        test_x = functions.add_pos_feature(test_x, prp_list)
        test_x = add_length_review_feature(test_x, review_lengths)

    # Each classifier is created from the training features and labels,
    # paired with the heading printed before its results.
    labelled_classifiers = [
        ("Logistic Regression\n",
         classification.logistic_regression(train_x, train_y)),
        ("K Nearest Neighbors\n",
         classification.knearest_neighbors(train_x, train_y)),
        ("Decision Trees\n",
         classification.decision_trees(train_x, train_y)),
        ("Random Forest\n",
         classification.random_forest(train_x, train_y)),
    ]

    for heading, clf in labelled_classifiers:
        print("-------------------------------------------------")
        print(heading)
        # Train and predict for both the training and the testing data.
        functions.train_classifier_and_evaluate_accuracy_on_training_data(
            clf, train_x, train_y)
        functions.train_classifier_and_evaluate_accuracy_on_testing_data(
            clf, test_x, test_y)
        print('\n\n')
def spring_brother(training_file, test_file, submission_file):
    """Build a classifier on the training file and score the test file.

    Args:
        training_file: Input for ``utilities.read_training_file``.
        test_file: Input for ``utilities.read_test_file``.
        submission_file: Destination passed to
            ``utilities.write_submission_file`` together with the test ids
            and the probabilities from ``classification.get_prob``.
    """
    labels, train_meta = utilities.read_training_file(training_file)
    test_ids, test_meta = utilities.read_test_file(test_file)

    train_features, test_features = feature_selection.generate_features(
        train_meta, labels, test_meta)

    # No validation data is supplied here: both trailing arguments are None.
    model = classification.random_forest(train_features, labels, None, None)
    probabilities = classification.get_prob(model, test_features)

    utilities.write_submission_file(submission_file, test_ids, probabilities)
def spring_brother(training_file, test_file, submission_file):
    """Build a classifier on the training file and score the test file.

    Args:
        training_file: Input for ``utilities.read_training_file``.
        test_file: Input for ``utilities.read_test_file``.
        submission_file: Destination passed to
            ``utilities.write_submission_file`` together with the test ids
            and the probabilities from ``classification.get_prob``.
    """
    labels, train_meta = utilities.read_training_file(training_file)
    test_ids, test_meta = utilities.read_test_file(test_file)

    train_features, test_features = feature_selection.generate_features(
        train_meta, labels, test_meta)

    # No validation data is supplied here: both trailing arguments are None.
    model = classification.random_forest(train_features, labels, None, None)
    probabilities = classification.get_prob(model, test_features)

    utilities.write_submission_file(submission_file, test_ids, probabilities)
def all_feature_classify(training_file, num):
    """Classifier using all features.

    Reads the training file, samples it with ``utilities.sample``, splits it
    with ``classification.prepare_data``, builds a classifier with
    ``classification.random_forest`` and prints the binomial deviance for
    the training part first and the cross-validation part second.

    Args:
        training_file: Input for ``utilities.read_training_file``.
        num: Forwarded to ``utilities.sample``.
    """
    y, meta_data = utilities.read_training_file(training_file)
    y, meta_data = utilities.sample(y, meta_data, num)

    meta_data_train, y_train, meta_data_cv, y_cv = (
        classification.prepare_data(meta_data, y))
    x_train, x_cv = feature_selection.generate_features(
        meta_data_train, y_train, meta_data_cv)

    clf = classification.random_forest(x_train, y_train, x_cv, y_cv)

    # Single-argument, parenthesised print produces the same output under
    # the Python 2 print statement and the Python 3 print function.
    print(utilities.binomial_deviance(
        y_train, classification.get_prob(clf, x_train)))
    print(utilities.binomial_deviance(
        y_cv, classification.get_prob(clf, x_cv)))
def all_feature_classify(training_file, num):
    """Classifier using all features.

    Reads the training file, samples it with ``utilities.sample``, splits it
    with ``classification.prepare_data``, builds a classifier with
    ``classification.random_forest`` and prints the binomial deviance for
    the training part first and the cross-validation part second.

    Args:
        training_file: Input for ``utilities.read_training_file``.
        num: Forwarded to ``utilities.sample``.
    """
    y, meta_data = utilities.read_training_file(training_file)
    y, meta_data = utilities.sample(y, meta_data, num)

    meta_data_train, y_train, meta_data_cv, y_cv = (
        classification.prepare_data(meta_data, y))
    x_train, x_cv = feature_selection.generate_features(
        meta_data_train, y_train, meta_data_cv)

    clf = classification.random_forest(x_train, y_train, x_cv, y_cv)

    # Single-argument, parenthesised print produces the same output under
    # the Python 2 print statement and the Python 3 print function.
    print(utilities.binomial_deviance(
        y_train, classification.get_prob(clf, x_train)))
    print(utilities.binomial_deviance(
        y_cv, classification.get_prob(clf, x_cv)))
def run_classifiers_with_bow(reviews, scores, review_lengths, with_features=False):
    """Evaluate four classifiers on bag-of-words review features.

    Args:
        reviews: Reviews passed to ``functions.create_bow_from_reviews``
            and, when ``with_features`` is true, to
            ``functions.create_pos_features``.
        scores: Labels split alongside the bag-of-words matrix.
        review_lengths: Passed to ``add_length_review_feature`` when
            ``with_features`` is true.
        with_features: When true, the part-of-speech and review-length
            features are added to both feature matrices.
    """
    bow_matrix, _vectorizer = functions.create_bow_from_reviews(reviews)
    train_x, test_x, train_y, test_y = train_test_split(
        bow_matrix, scores, test_size=0.25, random_state=42)

    if with_features:
        # Create the 'Part of Speech' feature vector for each review.
        # NOTE(review): prp_list and review_lengths cover all reviews while
        # train_x / test_x are split subsets -- confirm the helpers keep
        # rows aligned.
        prp_list = functions.create_pos_features(reviews)
        # Add both the POS and review-length vectors to the training
        # features, then do the same for the testing features.
        train_x = functions.add_pos_feature(train_x, prp_list)
        train_x = add_length_review_feature(train_x, review_lengths)
        test_x = functions.add_pos_feature(test_x, prp_list)
        test_x = add_length_review_feature(test_x, review_lengths)

    # Each classifier is created from the training features and labels,
    # paired with the heading printed before its results.
    labelled_classifiers = [
        ("Logistic Regression\n",
         classification.logistic_regression(train_x, train_y)),
        ("K Nearest Neighbors\n",
         classification.knearest_neighbors(train_x, train_y)),
        ("Decision Trees\n",
         classification.decision_trees(train_x, train_y)),
        ("Random Forest\n",
         classification.random_forest(train_x, train_y)),
    ]

    for heading, clf in labelled_classifiers:
        print("-------------------------------------------------")
        print(heading)
        # Train and predict for both the training and the testing data.
        functions.train_classifier_and_evaluate_accuracy_on_training_data(
            clf, train_x, train_y)
        functions.train_classifier_and_evaluate_accuracy_on_testing_data(
            clf, test_x, test_y)
        print('\n\n')
# Append the review-length feature to the matrix and show the result.
print("After adding length review feature")
X = functions.add_length_review_feature(X, length_of_reviews)
print(X)

# Append the part-of-speech tag feature and show the result.
print("After adding Part of Speech Tag feature")
prp_list = functions.create_pos_features(reviews)
X = functions.add_pos_feature(X, prp_list)
print(X)

# Run every classifier on the same features and scores, in this order:
# logistic regression, naive Bayes, k nearest neighbors, decision trees,
# random forests. `classifier` ends up bound to the random-forest result.
for build_classifier in (
        classification.logistic_regression,
        classification.naive_bayes,
        classification.knearest_neighbors,
        classification.decision_trees,
        classification.random_forest,
):
    classifier = build_classifier(X, scores)
if __name__ == '__main__':
    # Banner describing this run.
    print('Running Classifiers for dianping dataset')
    print("Does not include extra features")
    print("Using Bag of Words")
    print('------------------------------------------')

    # Load stopwords and the labelled reviews, then build the bag of words.
    stop = dianping.gather_stopwords()
    labels, reviews = dianping.read_chinese()
    BOW, vec = dianping.chinese_BOW(reviews, stop)

    # Run every classifier on the same bag of words and labels, in this
    # order: logistic regression, naive Bayes, k nearest neighbors,
    # decision trees, random forests. `classifier` ends up bound to the
    # random-forest result.
    for build_classifier in (
            classification.logistic_regression,
            classification.naive_bayes,
            classification.knearest_neighbors,
            classification.decision_trees,
            classification.random_forest,
    ):
        classifier = build_classifier(BOW, labels)
from classification import Naive_bayesian, KNN, random_forest, SVM
from sklearn.metrics import accuracy_score, classification_report
from classification import test_document_list, train_document_list
# NOTE(review): this rebinds Naive_bayesian, shadowing the one imported from
# `classification` above -- confirm the `search` version is the intended one.
from search import Naive_bayesian

# The first field of each test document is taken as its class label.
classes_test = [test_document[0] for test_document in test_document_list]

# NOTE(review): every slice below starts at index 1, so the first train and
# test documents are never used -- confirm this is intentional.

# Naive Bayes
classes_pred_1 = Naive_bayesian(train_document_list[1:500],
                                test_document_list[1:20])
print(classification_report(classes_test[1:20], classes_pred_1))
print(accuracy_score(classes_test[1:20], classes_pred_1))

# K nearest neighbours (third argument is 5).
classes_pred_2 = KNN(train_document_list[1:200], test_document_list[1:20], 5)
# Both metrics are computed against the same int-cast labels so that the
# report and the accuracy agree on the label type being compared.
classes_test_int = [int(c) for c in classes_test[1:20]]
print(classification_report(classes_test_int, classes_pred_2))
print(accuracy_score(classes_test_int, classes_pred_2))

# SVM
classes_pred_3 = SVM(train_document_list[1:500], test_document_list[1:20])
print(classification_report(classes_test[1:20], classes_pred_3))
print(accuracy_score(classes_test[1:20], classes_pred_3))

# Random forest
classes_pred_4 = random_forest(train_document_list[1:500],
                               test_document_list[1:20])
print(classification_report(classes_test[1:20], classes_pred_4))
print(accuracy_score(classes_test[1:20], classes_pred_4))
def random_forest(input_dict):
    """Random Forest learner.

    Args:
        input_dict: Mapping whose ``'params'`` entry is passed to
            ``classification.random_forest``.

    Returns:
        A dict with the single key ``'learner'`` holding the value returned
        by ``classification.random_forest``.
    """
    forest_params = input_dict['params']
    learner = classification.random_forest(forest_params)
    return dict(learner=learner)