def test_authors(p, bag_of_words=True, alpha=0.05, bag_of_char_ngrams=False,
		ngram_len=5, set_of_words=False, complexity_features=False,
		print_predictions=True):
	""" Tests the classifiers with the given feature sets.

	p is the Preprocessor object holding the path data. It must have been initialized by
	calling p.organize_authors()

	If bag_of_words argument is True, the program uses Multinomial Naive Bayes classifier
	with the given alpha and bag of words as the feature set.

	If bag_of_char_ngrams argument is True, the program uses Multinomial Naive Bayes classifier
	with the given alpha, and a bag of character n-grams as the feature set.

	If set_of_words argument is True, the program uses Binarized Multinomial Naive Bayes clasifier
	with the given alpha, and a set of words as the feature set.

	If complexity_features argument is True, the program uses Normalizing Naive Bayes, which
	is my term for a classifier which simply fits all features for all classes into their own normal
	distributions and calculates probabilities using the pdfs.

	Returns a 4-tuple, each being the score tuple a different feature set, in the order they are written
	above. Any feature sets not used will return a score of None.
	"""
	authors = p.get_authors()
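	# One classifier slot per feature-set flag; unused slots stay None so the
	# same index refers to the same feature set throughout this function.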
	classifiers = (MultinomialNaiveBayes(authors, alpha=alpha) if bag_of_words else None,
		MultinomialNaiveBayes(authors, alpha=alpha) if bag_of_char_ngrams else None,
		BinarizedMultinomialNaiveBayes(authors, alpha=alpha) if set_of_words else None,
		NormalizingNaiveBayes(authors, 8) if complexity_features else None)

	# Train the Bayes classifiers on each author's training data
	for author in authors:
		for clsf in classifiers:
			if clsf is not None:
				clsf.add_documents(author, len(p.training_data(author)))

		for data in p.training_data(author):

			# Featurize and add features to the classifiers
			t = Tokenizer(p.file_path(author, data))
			if classifiers[0] is not None:
				classifiers[0].add_feature_counts(author, t.bag_of_words())
			if classifiers[1] is not None:
				classifiers[1].add_feature_counts(author, t.bag_of_char_ngrams(ngram_len))
			if classifiers[2] is not None:
				classifiers[2].add_feature_counts(author, t.bag_of_words())
			if classifiers[3] is not None:
				classifiers[3].add_features(author, classifiers[3].vectorize(t.features()))

	for clsf in classifiers:
		if clsf is not None:
			clsf.train()

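	# One Tester per active classifier, indexed to match the classifiers tuple.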
	testers = (Tester(classifiers[0].get_classes()) if bag_of_words else None,
		Tester(classifiers[1].get_classes()) if bag_of_char_ngrams else None,
		Tester(classifiers[2].get_classes()) if set_of_words else None,
		Tester(classifiers[3].get_classes()) if complexity_features else None)

	# Check the classifier predictions for each test data
	for author in authors:
		for data in p.test_data(author):

			# Featurize and classify
			t = Tokenizer(p.file_path(author, data, training_data=False))
			class_predicted = [None, None, None, None]
			if classifiers[0] is not None:
				class_predicted[0] = classifiers[0].most_probable_class(classifiers[0].vectorize(t.bag_of_words()))
				testers[0].add_stat(class_predicted[0], author)
			if classifiers[1] is not None:
				class_predicted[1] = classifiers[1].most_probable_class(classifiers[1].vectorize(t.bag_of_char_ngrams(ngram_len)))
				testers[1].add_stat(class_predicted[1], author)
			if classifiers[2] is not None:
				class_predicted[2] = classifiers[2].most_probable_class(classifiers[2].vectorize(t.bag_of_words()))
				testers[2].add_stat(class_predicted[2], author)
			if classifiers[3] is not None:
				class_predicted[3] = classifiers[3].most_probable_class(classifiers[3].vectorize(t.features()))
				testers[3].add_stat(class_predicted[3], author)
			if print_predictions:
				print('predicted:', [pr for pr in class_predicted if pr is not None], 'actual:', author)
		
	return (testers[0].scores() if testers[0] is not None else None, testers[1].scores() if testers[1] is not None else None,
		testers[2].scores() if testers[2] is not None else None, testers[3].scores() if testers[3] is not None else None)
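
# A minimal sketch (hypothetical, not the actual NormalizingNaiveBayes
# implementation used above) of the idea described in the docstring: fit a
# normal distribution to each feature of each class, then score a feature
# vector by summing log-pdfs under the naive independence assumption.
import math

def gaussian_log_pdf(x, mean, var):
	# Log of the normal pdf N(x; mean, var), with a variance floor for stability.
	var = max(var, 1e-9)
	return -0.5 * (math.log(2 * math.pi * var) + (x - mean) ** 2 / var)

def normalizing_log_scores(vector, class_stats):
	# class_stats maps each class to a list of per-feature (mean, var) pairs;
	# returns each class's summed log-pdf for the given feature vector.
	return {cls: sum(gaussian_log_pdf(x, mean, var)
			for x, (mean, var) in zip(vector, stats))
		for cls, stats in class_stats.items()}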
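
# Example usage, a sketch under assumptions: the Preprocessor constructor and
# the corpus path are hypothetical; only the methods test_authors relies on
# (organize_authors, get_authors, training_data, test_data, file_path) come
# from the code above.
if __name__ == '__main__':
	p = Preprocessor('data/authors')  # hypothetical corpus location
	p.organize_authors()
	bow_scores, ngram_scores, sow_scores, cf_scores = test_authors(
		p, bag_of_words=True, bag_of_char_ngrams=True, print_predictions=False)
	print('bag of words scores:', bow_scores)
	print('char n-gram scores:', ngram_scores)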