def _split_three_quarters(docs):
    """Randomly split *docs* into a 3/4 training part and a 1/4 testing part.

    Args:
        docs: Sequence of documents belonging to a single class.

    Returns:
        The ``(train, test)`` pair returned by scikit-learn's
        ``train_test_split``.
    """
    # ``sklearn.cross_validation`` was removed in scikit-learn 0.20; prefer
    # its replacement and only fall back for very old installations.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split

    # Pure integer arithmetic: ``int(3 / 4 * n)`` collapses to 0 wherever
    # ``/`` is integer division, whereas ``3 * n // 4`` is always floor(3n/4).
    num_train = 3 * len(docs) // 4
    num_test = len(docs) - num_train
    return train_test_split(docs, train_size=num_train, test_size=num_test)


def main():
    """Train and evaluate a sentiment classifier as selected in ``constants``.

    Steps:
      1. Load the negative/positive documents of the configured corpus.
      2. Optionally apply NLTK negation marking to every document.
      3. Split each class 3/4 train, 1/4 test and merge the classes.
      4. Register the configured feature extractor on a SentimentAnalyzer.
      5. Train the configured classifier and print its evaluation.

    Side effects:
        Prints progress and evaluation results, and rebinds the module-level
        ``tagger`` when an adjective/POS feature extractor is selected.

    Raises:
        ValueError: If ``constants.corpus`` is not one of the corpora handled
            here (previously this surfaced later as an unbound-variable
            error).
    """
    global tagger

    # --- Load the corpus ---------------------------------------------------
    if constants.corpus == constants.Corpus.movie_review:
        neg_docs, pos_docs = get_movie_corpus()
    elif constants.corpus == constants.Corpus.pol_debates:
        neg_docs, pos_docs = get_political_debates()
    else:
        raise ValueError("Unsupported corpus: %r" % (constants.corpus,))

    if constants.mark_negation:
        mark_negation = nltk.sentiment.util.mark_negation
        neg_docs = [mark_negation(doc) for doc in neg_docs]
        pos_docs = [mark_negation(doc) for doc in pos_docs]

    # --- Split between the training set and the testing set ----------------
    # Negative documents are split first, then positive ones, matching the
    # original order in which the random splitter is invoked.
    train_neg, test_neg = _split_three_quarters(neg_docs)
    train_pos, test_pos = _split_three_quarters(pos_docs)

    # Make the final train set and test set.
    train_docs = train_pos + train_neg
    test_docs = test_pos + test_neg

    # --- Set up the Sentiment Analyzer -------------------------------------
    analyzer = SentimentAnalyzer()
    extractor = constants.feature_extractor
    if extractor == constants.FeatureExtractor.bag_of_words:
        analyzer.add_feat_extractor(extract_bag_of_words_feats)
    elif extractor == constants.FeatureExtractor.freq_dist:
        analyzer.add_feat_extractor(extract_freq_dist)
    elif extractor == constants.FeatureExtractor.unigram:
        all_words = analyzer.all_words(train_docs, labeled=True)
        unigram_features = analyzer.unigram_word_feats(all_words,
                                                       min_freq=1000)
        print("Length of unigram features: %d" % len(unigram_features))
        analyzer.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats,
                                    unigrams=unigram_features)
    elif extractor == constants.FeatureExtractor.bigram_bag_of_words:
        analyzer.add_feat_extractor(extract_sig_bigram_feats)
    elif extractor == constants.FeatureExtractor.adjective_bag_of_words:
        tagger = nltk.tag.HunposTagger(constants.hunpos_english_model)
        analyzer.add_feat_extractor(adjective_bag_of_words)
    elif extractor == constants.FeatureExtractor.pos_bag_of_words:
        tagger = nltk.tag.HunposTagger(constants.hunpos_english_model)
        # NOTE(review): this registers the *adjective* extractor for the POS
        # option, exactly as the original did. It looks like a copy-paste
        # slip; confirm whether a dedicated POS extractor should be used.
        analyzer.add_feat_extractor(adjective_bag_of_words)
    # Any other value registers no extractor, as before.

    train_feat = list(analyzer.apply_features(train_docs, labeled=True))
    test_feat = list(analyzer.apply_features(test_docs, labeled=True))

    print('train on %d instances, test on %d instances' %
          (len(train_feat), len(test_feat)))

    # --- Train the selected classifier -------------------------------------
    # NOTE: a maxent branch existed only as commented-out code in the
    # original and therefore remains unsupported here.
    selected = constants.classifier
    if selected == constants.Classifier.naive_bays:
        classifier = NaiveBayesClassifier.train(train_feat)
    elif selected == constants.Classifier.decision_tree:
        classifier = SklearnClassifier(
            DecisionTreeClassifier()).train(train_feat)
    elif selected == constants.Classifier.linear_svm:
        classifier = SklearnClassifier(LinearSVC()).train(train_feat)
    elif selected == constants.Classifier.random_forest:
        classifier = SklearnClassifier(
            RandomForestClassifier()).train(train_feat)
    elif selected == constants.Classifier.logistic:
        classifier = SklearnClassifier(LogisticRegression()).train(train_feat)
    else:
        # Unrecognised classifier choice: nothing is trained or evaluated,
        # which is what the original if/elif chain did as well.
        return

    # --- Evaluate ----------------------------------------------------------
    # One shared call replaces the identical call repeated in every branch.
    analyzer.evaluate(test_feat, classifier, accuracy=True, f_measure=True,
                      precision=True, recall=True, verbose=True)

    # Only the NLTK Naive Bayes model reported its informative features.
    if selected == constants.Classifier.naive_bays:
        classifier.show_most_informative_features()
# print(len(X_test))  # debugging aid, left disabled

# Obtain the bag-of-words transformer and the vectorised train/test matrices.
if preprocess_flag:
    # Fit the vectoriser on the raw training data, vectorise both splits,
    # then persist all three artefacts so later runs can simply load them.
    bow_transformer = CountVectorizer(analyzer=format_sentence)
    bow_transformer = bow_transformer.fit(X_train)
    X_train = bow_transformer.transform(X_train)
    X_test = bow_transformer.transform(X_test)
    joblib.dump(bow_transformer, 'FeatTransformer.pkl')
    joblib.dump(X_train, 'TrainFeatures.pkl')
    joblib.dump(X_test, 'TestFeatures.pkl')
else:
    # Reuse the artefacts written by an earlier preprocessing run.
    bow_transformer = joblib.load('FeatTransformer.pkl')
    X_train = joblib.load('TrainFeatures.pkl')
    X_test = joblib.load('TestFeatures.pkl')

# Decision tree classifier.
# dt_flag == 1: train the model from scratch and dump it to disk.
# dt_flag == 0: load the previously dumped model instead.
dt_flag = 0
dt = DecisionTreeClassifier()
if not dt_flag:
    dt_clf = joblib.load('DTmodel.pkl')
else:
    dt_clf = dt.fit(X_train, Y_train)
    joblib.dump(dt_clf, 'DTmodel.pkl')

# Evaluate the decision tree on the held-out test features.
preds = dt_clf.predict(X_test)
cm = confusion_matrix(Y_test, preds)
print(cm)
print('\n')
print(classification_report(Y_test, preds))

# plot_roc_curve(dt_clf, X_test, Y_test)  # ROC plot left disabled

# Show the confusion matrix (normalised) as a figure.
plt.figure()
plot_confusion_matrix(
    cm,
    classes=['negative', 'positive'],
    normalize=True,
    title='Normalized confusion matrix - Decision Tree',
)
plt.show()