Beispiel #1
0
def high_information(feats, categories):
	"""Return the set of high-information words for a labelled feature set.

	As a side effect, prints the total number of words, the number of
	distinct words and the number of high-information words.

	Args:
		feats: iterable of items whose element 0 is a bag of words (a sized
			collection whose iteration yields the words, e.g. a dict keyed
			by word) and whose element 1 is the category label of that bag.
		categories: iterable of category labels. Only these labels are
			handed to ``high_information_words``; it is walked exactly once,
			so a one-shot iterable is fine.

	Returns:
		set: the words produced by ``high_information_words`` for the
		per-category word lists. It can be used to restrict bags of words,
		e.g. ``bag_of_words_in_set(words, high_info_words)``.
	"""
	from collections import defaultdict

	print("\n##### Obtaining high information words...")

	# 1. Convert our features to the format required by
	#    high_information_words: one (category, [word, ...]) pair per category.
	words = defaultdict(list)
	total_words = 0
	distinct_words = set()
	for feat in feats:
		bag, category = feat[0], feat[1]
		words[category].extend(bag)
		# Only the counts are reported, so there is no need to keep a second
		# full copy of every word.
		total_words += len(bag)
		distinct_words.update(bag)

	# Categories that never occurred in feats get an empty list from the
	# defaultdict; labels seen in feats but absent from categories are ignored.
	labelled_words = [(category, words[category]) for category in categories]

	# 2. Calculate the high information words.
	high_info_words = set(high_information_words(labelled_words))

	print("  Number of words in the data: %i" % total_words)
	print("  Number of distinct words in the data: %i" % len(distinct_words))
	print("  Number of distinct 'high-information' words in the data: %i" % len(high_info_words))

	return high_info_words
def high_information(feats, categories, min_score=12):
	"""Return the set of high-information words for a labelled feature set.

	As a side effect, prints the total number of words, the number of
	distinct words and the number of high-information words.

	Args:
		feats: iterable of items whose element 0 is a bag of words (a sized
			collection whose iteration yields the words, e.g. a dict keyed
			by word) and whose element 1 is the category label of that bag.
		categories: iterable of category labels. Only these labels are
			handed to ``high_information_words``; it is walked exactly once.
		min_score: forwarded unchanged as the ``min_score`` keyword of
			``high_information_words``. Defaults to 12, the value this
			function previously hard-coded.

	Returns:
		set: the words produced by ``high_information_words`` for the
		per-category word lists.
	"""
	from collections import defaultdict

	print("\n##### Obtaining high information words...")

	# Group the words of every bag under the bag's category label.
	words = defaultdict(list)
	total_words = 0
	distinct_words = set()
	for feat in feats:
		bag, category = feat[0], feat[1]
		words[category].extend(bag)
		# Only the counts are reported, so a full list of all words is not kept.
		total_words += len(bag)
		distinct_words.update(bag)

	# Categories without any bag get an empty list from the defaultdict.
	labelled_words = [(category, words[category]) for category in categories]
	high_info_words = set(high_information_words(labelled_words, min_score=min_score))

	print("  Number of words in the data: %i" % total_words)
	print("  Number of distinct words in the data: %i" % len(distinct_words))
	print("  Number of distinct 'high-information' words in the data: %i" % len(high_info_words))

	return high_info_words
def high_information(feats, categories):
	"""Collect the high-information words of a feature set and return them as a set.

	feats is a sequence of (bag, label) items, where bag offers .keys()
	yielding its words; categories lists the labels to score. Summary
	counts are printed before the set is returned.
	"""
	from collections import defaultdict

	print("\n##### Obtaining high information words...")

	# Step 1: regroup the features into the layout high_information_words
	# expects, i.e. one word list per category.
	words_by_label = defaultdict(list)
	words_by_label.update((label, []) for label in categories)
	every_word = []

	for sample in feats:
		label = sample[1]
		tokens = sample[0]
		words_by_label[label].extend(tokens.keys())
		every_word.extend(tokens.keys())

	pairs = []
	for label in categories:
		pairs.append((label, words_by_label[label]))

	# Step 2: score the words and keep the informative ones.
	selected = set(high_information_words(pairs, min_score=2))

	total_count = len(every_word)
	distinct_count = len(set(every_word))
	print("  Number of words in the data: %i" % total_count)
	print("  Number of distinct words in the data: %i" % distinct_count)
	print("  Number of distinct 'high-information' words in the data: %i" % len(selected))

	return selected
Beispiel #4
0
def high_information(feats, categories):
    """Return the set of high-information words for a labelled feature set.

    Args:
        feats: iterable of items whose element 0 is a bag of words (its
            iteration yields the words, e.g. a dict keyed by word) and
            whose element 1 is the category label of that bag.
        categories: iterable of category labels. Only these labels are
            handed to ``high_information_words``; it is walked exactly once.

    Returns:
        set: the words produced by ``high_information_words`` for the
        per-category word lists.
    """
    from collections import defaultdict

    # 1. Convert our features to the format required by
    #    high_information_words: one (category, [word, ...]) pair per category.
    words = defaultdict(list)
    for feat in feats:
        bag, category = feat[0], feat[1]
        words[category].extend(bag)

    # Categories without any bag get an empty list from the defaultdict.
    labelled_words = [(category, words[category]) for category in categories]

    # 2. Calculate the high information words.
    return set(high_information_words(labelled_words))
def high_information(feats, categories, min_score=5):
    """Return the set of high-information words for a labelled feature set.

    Args:
        feats: iterable of items whose element 0 is an iterable of words
            (iterating a dict keyed by word yields its words) and whose
            element 1 is the category label of those words.
        categories: iterable of category labels. Only these labels are
            handed to ``high_information_words``; it is walked exactly once.
        min_score: forwarded unchanged as the ``min_score`` keyword of
            ``high_information_words``. Defaults to 5, the value this
            function previously hard-coded.

    Returns:
        set: the words produced by ``high_information_words`` for the
        per-category word lists.
    """
    from collections import defaultdict

    # Group the words of every bag under the bag's category label.
    words = defaultdict(list)
    for feat in feats:
        bag, category = feat[0], feat[1]
        words[category].extend(bag)

    # Categories without any bag get an empty list from the defaultdict.
    labelled_words = [(category, words[category]) for category in categories]
    return set(high_information_words(labelled_words, min_score=min_score))
# Examples relating to chapter 7.
#############################################################################

from featx import label_feats_from_corpus, split_label_feats, high_information_words, bag_of_words_in_set
from classification import precision_recall, MaxVoteClassifier    # classification.py debe estar en el mismo dir.
from nltk.corpus import movie_reviews
from nltk.classify.util import accuracy
from nltk.classify import NaiveBayesClassifier
from nltk.classify import MaxentClassifier
from nltk.classify import DecisionTreeClassifier
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import LinearSVC

# Script: train and evaluate a Naive Bayes classifier on the movie_reviews
# corpus, using only the words selected by high_information_words as features.

# One (label, words) pair per corpus category.
labels = movie_reviews.categories()
labeled_words = [(l, movie_reviews.words(categories=[l])) for l in labels]
# Words selected by featx.high_information_words from those pairs.
high_info_words = set(high_information_words(labeled_words))
# Feature detector that delegates to featx.bag_of_words_in_set with
# high_info_words (presumably keeping only words in that set — confirm in featx).
feat_det = lambda words: bag_of_words_in_set(words, high_info_words)
# Extract labelled features from the corpus and split them into the
# training and test sets used below.
lfeats = label_feats_from_corpus(movie_reviews, feature_detector=feat_det)
train_feats, test_feats = split_label_feats(lfeats)

print("######################################################################")
# Train on train_feats, then report accuracy, precision and recall on test_feats.
# NOTE(review): the "# ...: value" comments below look like outputs recorded
# from an earlier run — confirm before relying on them.
nb_classifier = NaiveBayesClassifier.train(train_feats)
print("Accuracy Naive Bayes: " + str(accuracy(nb_classifier, test_feats)))
# Accuracy: 0.91
# precision_recall returns two mappings indexed by label ('pos' / 'neg' here).
nb_precisions, nb_recalls = precision_recall(nb_classifier, test_feats)
print("Precisions Naive Bayes Pos: " + str(nb_precisions['pos']))
# Precisions Pos: 0.8988326848249028
print("Precisions Naive Bayes Neg: " + str(nb_precisions['neg']))
# Precisions Neg: 0.9218106995884774
print("Recalls Naive Bayes Pos: " + str(nb_recalls['pos']))
# Recalls Pos: 0.924