def high_information(feats, categories):
    """Return the set of high-information words found in a feature set.

    Args:
        feats: iterable of features; ``feat[0]`` is a bag of words (a mapping
            keyed by word) and ``feat[1]`` is the category of that bag.
        categories: sequence of category labels to pass on to
            ``high_information_words``.

    Returns:
        A ``set`` built from the result of ``high_information_words`` called
        with its default settings.  The set can be used to restrict later
        bags of words, e.g. ``bag_of_words_in_set(words, high_info_words)``.

    Side effects:
        Prints a header line and three word-count statistics.
    """
    from collections import defaultdict

    print("\n##### Obtaining high information words...")

    # 1. Convert our features to the (category, words) layout that
    #    high_information_words is called with.
    words = defaultdict(list)
    all_words = []
    for feat in feats:
        bag, category = feat[0], feat[1]
        # Iterating a mapping yields its keys, so no .keys() call is needed.
        words[category].extend(bag)
        all_words.extend(bag)
    # A category without any bag simply resolves to an empty list here,
    # courtesy of the defaultdict.
    labelled_words = [(category, words[category]) for category in categories]

    # 2. Calculate the high-information words.
    high_info_words = set(high_information_words(labelled_words))

    print(" Number of words in the data: %i" % len(all_words))
    print(" Number of distinct words in the data: %i" % len(set(all_words)))
    print(" Number of distinct 'high-information' words in the data: %i" % len(high_info_words))
    return high_info_words
def high_information(feats, categories, min_score=12):
    """Return the set of high-information words found in a feature set.

    Args:
        feats: iterable of features; ``feat[0]`` is a bag of words (a mapping
            keyed by word) and ``feat[1]`` is the category of that bag.
        categories: sequence of category labels to pass on to
            ``high_information_words``.
        min_score: value forwarded as ``min_score`` to
            ``high_information_words``.  Defaults to 12, the value that was
            previously hard-coded here.

    Returns:
        A ``set`` built from the result of ``high_information_words``.

    Side effects:
        Prints a header line and three word-count statistics.
    """
    from collections import defaultdict

    print("\n##### Obtaining high information words...")

    # Group the words of every bag under the category of that bag.
    words = defaultdict(list)
    all_words = []
    for feat in feats:
        bag, category = feat[0], feat[1]
        # Iterating a mapping yields its keys, so no .keys() call is needed.
        words[category].extend(bag)
        all_words.extend(bag)
    # Categories that never occurred in feats resolve to an empty list.
    labelled_words = [(category, words[category]) for category in categories]

    high_info_words = set(high_information_words(labelled_words, min_score=min_score))

    print(" Number of words in the data: %i" % len(all_words))
    print(" Number of distinct words in the data: %i" % len(set(all_words)))
    print(" Number of distinct 'high-information' words in the data: %i" % len(high_info_words))
    return high_info_words
def high_information(feats, categories, min_score=2):
    """Return the set of high-information words based on a feature set.

    Args:
        feats: iterable of features; ``feat[0]`` is a bag of words (a mapping
            keyed by word) and ``feat[1]`` is the category of that bag.
        categories: sequence of category labels to pass on to
            ``high_information_words``.
        min_score: value forwarded as ``min_score`` to
            ``high_information_words``.  Defaults to 2, the value that was
            previously hard-coded here.

    Returns:
        A ``set`` (not a list) built from the result of
        ``high_information_words``.

    Side effects:
        Prints a header line and three word-count statistics.
    """
    from collections import defaultdict

    print("\n##### Obtaining high information words...")

    # 1. Convert our features to the (category, words) layout that
    #    high_information_words is called with.
    words = defaultdict(list)
    all_words = []
    for feat in feats:
        bag, category = feat[0], feat[1]
        # Iterating a mapping yields its keys, so no .keys() call is needed.
        words[category].extend(bag)
        all_words.extend(bag)
    # Categories that never occurred in feats resolve to an empty list.
    labelled_words = [(category, words[category]) for category in categories]

    # 2. Calculate the high-information words.
    high_info_words = set(high_information_words(labelled_words, min_score=min_score))

    print(" Number of words in the data: %i" % len(all_words))
    print(" Number of distinct words in the data: %i" % len(set(all_words)))
    print(" Number of distinct 'high-information' words in the data: %i" % len(high_info_words))
    return high_info_words
def high_information(feats, categories):
    """Return the set of high-information words found in a feature set.

    Args:
        feats: iterable of features; ``feat[0]`` is a bag of words (a mapping
            keyed by word) and ``feat[1]`` is the category of that bag.
        categories: sequence of category labels to pass on to
            ``high_information_words``.

    Returns:
        A ``set`` built from the result of ``high_information_words`` called
        with its default settings.
    """
    from collections import defaultdict

    # 1. Convert our features to the (category, words) layout that
    #    high_information_words is called with.
    words = defaultdict(list)
    for feat in feats:
        bag, category = feat[0], feat[1]
        # Iterating a mapping yields its keys, so no .keys() call is needed.
        words[category].extend(bag)
    # Categories that never occurred in feats resolve to an empty list.
    labelled_words = [(category, words[category]) for category in categories]

    # 2. Calculate the high-information words.
    return set(high_information_words(labelled_words))
def high_information(feats, categories, min_score=5):
    """Return the set of high-information words found in a feature set.

    Args:
        feats: iterable of features; ``feat[0]`` is a bag of words (any
            iterable of words, e.g. a dict keyed by word) and ``feat[1]`` is
            the category of that bag.
        categories: sequence of category labels to pass on to
            ``high_information_words``.
        min_score: value forwarded as ``min_score`` to
            ``high_information_words``.  Defaults to 5, the value that was
            previously hard-coded here.

    Returns:
        A ``set`` built from the result of ``high_information_words``.
    """
    from collections import defaultdict

    # Group the words of every bag under the category of that bag.
    words = defaultdict(list)
    for feat in feats:
        bag, category = feat[0], feat[1]
        words[category].extend(bag)
    # Categories that never occurred in feats resolve to an empty list.
    labelled_words = [(category, words[category]) for category in categories]

    return set(high_information_words(labelled_words, min_score=min_score))
# Referidos al capitulo 7. ############################################################################# from featx import label_feats_from_corpus, split_label_feats, high_information_words, bag_of_words_in_set from classification import precision_recall, MaxVoteClassifier # classification.py debe estar en el mismo dir. from nltk.corpus import movie_reviews from nltk.classify.util import accuracy from nltk.classify import NaiveBayesClassifier from nltk.classify import MaxentClassifier from nltk.classify import DecisionTreeClassifier from nltk.classify.scikitlearn import SklearnClassifier from sklearn.svm import LinearSVC labels = movie_reviews.categories() labeled_words = [(l, movie_reviews.words(categories=[l])) for l in labels] high_info_words = set(high_information_words(labeled_words)) feat_det = lambda words: bag_of_words_in_set(words, high_info_words) lfeats = label_feats_from_corpus(movie_reviews, feature_detector=feat_det) train_feats, test_feats = split_label_feats(lfeats) print("######################################################################") nb_classifier = NaiveBayesClassifier.train(train_feats) print("Accuracy Naive Bayes: " + str(accuracy(nb_classifier, test_feats))) # Accuracy: 0.91 nb_precisions, nb_recalls = precision_recall(nb_classifier, test_feats) print("Precisions Naive Bayes Pos: " + str(nb_precisions['pos'])) # Precisions Pos: 0.8988326848249028 print("Precisions Naive Bayes Neg: " + str(nb_precisions['neg'])) # Precisions Neg: 0.9218106995884774 print("Recalls Naive Bayes Pos: " + str(nb_recalls['pos'])) # Recalls Pos: 0.924