Example #1

import random
import sys

import nltk.classify.util
from nltk.classify import NaiveBayesClassifier

# Assumed to be defined elsewhere in the module: reddit_politics (a categorized
# corpus reader), FILTER_LIST, DEBUG, utils, and word_feats (see Example #2).

def create_classifier(iterations=100):
    """
    Train a Naive Bayes classifier on a random 75/25 train/test split of the
    data for each iteration and return the one that achieved the highest
    test accuracy.

    :param iterations: number of random train/test splits to evaluate
    :return: tuple of (best classifier, its test accuracy)
    """
    negids = reddit_politics.fileids("neg")
    posids = reddit_politics.fileids("pos")

    negfeats = [(word_feats(reddit_politics.words(fileids=[f]), FILTER_LIST), "neg") for f in negids]
    posfeats = [(word_feats(reddit_politics.words(fileids=[f]), FILTER_LIST), "pos") for f in posids]

    # track the most accurate classifier so far
    best_classifier = None
    highest_accuracy = 0
    for iter_num in range(iterations):
        # randomly shuffle the feature sets to get new subsets to test and train on
        random.shuffle(negfeats)
        random.shuffle(posfeats)

        negcutoff = int(len(negfeats) * 3 / 4)
        poscutoff = int(len(posfeats) * 3 / 4)

        trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
        testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

        if DEBUG:
            print("Train on %d instances, test on %d instances.\n" % (len(trainfeats), len(testfeats)))

        # train the classifier on the training features and determine its accuracy
        classifier = NaiveBayesClassifier.train(trainfeats)
        accuracy = nltk.classify.util.accuracy(classifier, testfeats)

        if DEBUG:
            print("\nAccuracy:", accuracy)

        # if this classifier outperformed all before it, track it and its accuracy
        if accuracy > highest_accuracy:
            highest_accuracy = accuracy
            best_classifier = classifier
        utils.update_progress(iter_num / iterations, message="Testing Classifiers")
    sys.stdout.write("\n\n")
    return (best_classifier, highest_accuracy)
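
A minimal usage sketch (assuming the reddit_politics corpus and the other module-level names above are available; show_most_informative_features is NLTK's built-in inspection helper):

best, accuracy = create_classifier(iterations=50)
print("Best accuracy over 50 splits: %.3f" % accuracy)
best.show_most_informative_features(10)
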
Example #2

import random

from nltk.corpus import movie_reviews

# Assumed to be defined elsewhere in the module: reddit_politics (a categorized
# corpus reader), FILTER_LIST, and ITERATIONS.


def word_feats(words, filter_list):
    # Bag-of-words features: mark each unfiltered word longer than 2 characters as present
    return {word: True for word in words if word not in filter_list and len(word) > 2}
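
For instance, with a hypothetical filter list, word_feats yields a word-presence feature dictionary; words of two or fewer characters are dropped regardless of the filter:

word_feats(["the", "election", "is", "close"], filter_list={"the"})
# -> {"election": True, "close": True}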



# get all of the file IDs associated with the positive and negative entries in each corpus
reddit_negids = reddit_politics.fileids('neg')
reddit_posids = reddit_politics.fileids('pos')
movie_negids = movie_reviews.fileids('neg')
movie_posids = movie_reviews.fileids('pos')
 
# process the positive and negative features of each corpus
reddit_negfeats = [(word_feats(reddit_politics.words(fileids=[f]), FILTER_LIST), 'neg') for f in reddit_negids]
reddit_posfeats = [(word_feats(reddit_politics.words(fileids=[f]), FILTER_LIST), 'pos') for f in reddit_posids]
movie_negfeats = [(word_feats(movie_reviews.words(fileids=[f]), FILTER_LIST), 'neg') for f in movie_negids]
movie_posfeats = [(word_feats(movie_reviews.words(fileids=[f]), FILTER_LIST), 'pos') for f in movie_posids]
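
Each entry in these lists is a (features, label) pair in the form NLTK's classifiers expect; schematically, one negative Reddit entry looks like ({"election": True, "rigged": True, ...}, "neg").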

# list of tuples containing the accuracies of each training corpus
accuracies = []
for _ in range(ITERATIONS):
    # shuffle the reddit features each iteration so that a new subset of them is being used for testing
    random.shuffle(reddit_negfeats)
    random.shuffle(reddit_posfeats)

    # find the cutoff points in both the positive and negative feature sets to divide training and testing data
    negcutoff = int(len(reddit_negfeats) * 3 / 4)
    poscutoff = int(len(reddit_posfeats) * 3 / 4)