import random
import sys

import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews

# reddit_politics (a custom corpus reader for the scraped reddit data),
# FILTER_LIST, DEBUG, ITERATIONS, and utils are assumed to be defined
# elsewhere in the project.


def create_classifier(iterations=100):
    """
    Return the classifier that did the best at classifying a subset of the
    data after training for the given number of iterations.

    :param iterations: number of iterations to test on
    :return: tuple: (classifier, accuracy of classifier)
    """
    negids = reddit_politics.fileids("neg")
    posids = reddit_politics.fileids("pos")
    negfeats = [(word_feats(reddit_politics.words(fileids=[f]), FILTER_LIST), "neg")
                for f in negids]
    posfeats = [(word_feats(reddit_politics.words(fileids=[f]), FILTER_LIST), "pos")
                for f in posids]

    # track the most accurate classifier so far
    best_classifier = None
    highest_accuracy = 0

    for iter_num in range(iterations):
        # randomly shuffle the feature sets to get new subsets to test and train on
        random.shuffle(negfeats)
        random.shuffle(posfeats)

        # train on the first three quarters of each feature set, test on the rest
        negcutoff = int(len(negfeats) * 3 / 4)
        poscutoff = int(len(posfeats) * 3 / 4)
        trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
        testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
        if DEBUG:
            print("Train on %d instances, test on %d instances.\n"
                  % (len(trainfeats), len(testfeats)))

        # train the classifier on the training features and determine its accuracy
        classifier = NaiveBayesClassifier.train(trainfeats)
        accuracy = nltk.classify.util.accuracy(classifier, testfeats)
        if DEBUG:
            print("\nAccuracy:", accuracy)

        # if this classifier outperformed all before it, track it and its accuracy
        if accuracy > highest_accuracy:
            highest_accuracy = accuracy
            best_classifier = classifier

        utils.update_progress(iter_num / iterations, message="Testing Classifiers")
    sys.stdout.write("\n\n")
    return (best_classifier, highest_accuracy)
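
# A minimal sketch of how create_classifier() might be driven; the report
# format, the feature dump, and the pickle filename are assumptions rather
# than part of the original script. It is wrapped in a function so that it
# can be invoked once the rest of the module (including word_feats below)
# has been defined.
def demo_best_classifier():
    import pickle

    best, acc = create_classifier(iterations=100)
    print("Best accuracy over 100 iterations: %.2f%%" % (acc * 100))
    # NLTK's NaiveBayesClassifier can report which features most strongly
    # predict each label
    best.show_most_informative_features(10)
    # persist the winning classifier so it can be reused without retraining
    with open("best_classifier.pickle", "wb") as out:
        pickle.dump(best, out)
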
def word_feats(words, filter_list):
    """Build a bag-of-words feature dict, keeping only words that are not in
    the filter list and are longer than two characters.

    For example, word_feats(["the", "vote", "is", "in"], ["the"]) returns
    {"vote": True}: "the" is filtered out and "is"/"in" are too short.
    """
    return dict([(word, True) for word in words
                 if word not in filter_list and len(word) > 2])


# get all of the file IDs associated with the positive and negative entries in each corpus
reddit_negids = reddit_politics.fileids('neg')
reddit_posids = reddit_politics.fileids('pos')
movie_negids = movie_reviews.fileids('neg')
movie_posids = movie_reviews.fileids('pos')

# process the positive and negative features of each corpus
reddit_negfeats = [(word_feats(reddit_politics.words(fileids=[f]), FILTER_LIST), 'neg')
                   for f in reddit_negids]
reddit_posfeats = [(word_feats(reddit_politics.words(fileids=[f]), FILTER_LIST), 'pos')
                   for f in reddit_posids]
movie_negfeats = [(word_feats(movie_reviews.words(fileids=[f]), FILTER_LIST), 'neg')
                  for f in movie_negids]
movie_posfeats = [(word_feats(movie_reviews.words(fileids=[f]), FILTER_LIST), 'pos')
                  for f in movie_posids]

# list of tuples containing the accuracies of each training corpus
accuracies = []
for _ in range(ITERATIONS):
    # shuffle the reddit features each iteration so that a new subset of them is used for testing
    random.shuffle(reddit_negfeats)
    random.shuffle(reddit_posfeats)

    # find the cutoff points in both the positive and negative feature sets
    # to divide training and testing data
    negcutoff = int(len(reddit_negfeats) * 3 / 4)
    poscutoff = int(len(reddit_posfeats) * 3 / 4)
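    # The excerpt cuts off mid-loop; the rest of the body below is a hedged
    # reconstruction that mirrors the split/train/score pattern in
    # create_classifier() above, so the exact comparison logic is an
    # assumption, not the original code.

    # hold out the last quarter of the shuffled reddit data for testing
    reddit_trainfeats = reddit_negfeats[:negcutoff] + reddit_posfeats[:poscutoff]
    reddit_testfeats = reddit_negfeats[negcutoff:] + reddit_posfeats[poscutoff:]

    # the movie review corpus is only ever trained on, so all of it is used
    # (assumed; the excerpt does not show how it is split, if at all)
    movie_trainfeats = movie_negfeats + movie_posfeats

    # train one classifier per corpus and score both on the same reddit test
    # set, recording the pair of accuracies for this iteration
    reddit_classifier = NaiveBayesClassifier.train(reddit_trainfeats)
    movie_classifier = NaiveBayesClassifier.train(movie_trainfeats)
    reddit_acc = nltk.classify.util.accuracy(reddit_classifier, reddit_testfeats)
    movie_acc = nltk.classify.util.accuracy(movie_classifier, reddit_testfeats)
    accuracies.append((reddit_acc, movie_acc))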