def setup():
    """Select the most informative corpus words and train the classifier.

    Every word of the ``movie_reviews`` corpus is scored with a chi-square
    association measure against the 'pos' and 'neg' labels. The 10000
    highest-scoring words are stored in the module-level ``bestwords`` set.

    Returns:
        Whatever ``train(best_bigram_word_features)`` returns.
    """
    global bestwords

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for label in ('pos', 'neg'):
        for word in movie_reviews.words(categories=[label]):
            # Normalise once and use the same token for both distributions;
            # counting stripped tokens in one and unstripped tokens in the
            # other makes the chi-square inputs inconsistent.
            token = word.strip('\'"?,.').lower()
            # ``fd[x] += 1`` replaces ``fd.inc(x)``, which newer NLTK
            # releases no longer provide.
            word_fd[token] += 1
            label_word_fd[label][token] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_fd['pos'][word], (freq, pos_word_count),
            total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_fd['neg'][word], (freq, neg_word_count),
            total_word_count)
        word_scores[word] = pos_score + neg_score

    # Tuple-unpacking lambdas are a syntax error on Python 3; index instead.
    best = sorted(word_scores.items(), key=lambda pair: pair[1],
                  reverse=True)[:10000]
    bestwords = set(w for w, s in best)
    return train(best_bigram_word_features)
def load_data():
    """Build labelled feature lists for the movie-review corpus.

    Fills the module-level ``negfeats`` and ``posfeats`` lists with
    ``(word_feats(words), label)`` pairs, one per corpus file, and returns
    ``None``.
    """
    global posfeats, negfeats

    neg_files = movie_reviews.fileids('neg')
    pos_files = movie_reviews.fileids('pos')

    negfeats = []
    for fid in neg_files:
        negfeats.append((word_feats(movie_reviews.words(fileids=[fid])), 'neg'))

    posfeats = []
    for fid in pos_files:
        posfeats.append((word_feats(movie_reviews.words(fileids=[fid])), 'pos'))

    return
def GetHighInformationWordsChi(num_bestwords):
    """Return the highest-scoring words of the movie-review corpus.

    Each lower-cased word is scored as the sum of its chi-square association
    with the 'pos' label and with the 'neg' label.

    Args:
        num_bestwords: how many of the top-scoring words to keep.

    Returns:
        A set with at most ``num_bestwords`` words.
    """
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for label in ('pos', 'neg'):
        for word in movie_reviews.words(categories=[label]):
            token = word.lower()
            word_fd[token] += 1
            label_word_fd[label][token] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # ``items()`` works on both Python 2 and 3; ``iteritems()`` is 2-only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_fd['pos'][word], (freq, pos_word_count),
            total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_fd['neg'][word], (freq, neg_word_count),
            total_word_count)
        word_scores[word] = pos_score + neg_score

    # Tuple-unpacking lambdas are a syntax error on Python 3; index instead.
    best = sorted(word_scores.items(), key=lambda pair: pair[1],
                  reverse=True)[:num_bestwords]
    return set(w for w, s in best)
def maketrainset(movie_reviews, tokenizer, stemmer):
    """Create a labelled training set from a categorised corpus.

    Args:
        movie_reviews: corpus object offering ``fileids(category)`` and
            ``words(fileids=[...])``.
        tokenizer: callable invoked as ``tokenizer(words, stemmer)`` to turn
            the words of one file into its features.
        stemmer: passed through unchanged to ``tokenizer``.

    Returns:
        A list of ``(features, label)`` pairs: all 'neg' files first, then
        all 'pos' files.
    """
    neg_files = movie_reviews.fileids('neg')
    pos_files = movie_reviews.fileids('pos')

    trainfeats = []
    for label, files in (('neg', neg_files), ('pos', pos_files)):
        for fid in files:
            words = movie_reviews.words(fileids=[fid])
            trainfeats.append((tokenizer(words, stemmer), label))
    return trainfeats
def main():
    """Annotate the reviews in ``output.json`` with Naive Bayes sentiment.

    A ``NaiveBayesClassifier`` is trained on the first three quarters of the
    negative and positive movie reviews. Every entry of
    ``data[key]["reviews"]`` read from ``output.json`` then receives a
    ``"sentiment"`` dict (positive/negative probability and label), and the
    result is written to ``out_with_sentiment.json``.
    """
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg')
                for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos')
                for f in posids]

    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    classifier = NaiveBayesClassifier.train(trainfeats)

    # Read everything first so the input handle is closed before writing.
    with open("output.json") as fin:
        data = json.load(fin)

    for key in data:
        for review in data[key]["reviews"]:
            # Extract the features once and reuse them for both queries.
            feats = word_feats(review["review"].split(" "))
            prob = classifier.prob_classify(feats)
            review["sentiment"] = {
                'positive_probability': prob.prob('pos'),
                'label': classifier.classify(feats),
                'negative_probability': prob.prob('neg'),
            }

    with open('out_with_sentiment.json', 'w') as outfile:
        json.dump(data, outfile)
def __init__(self):
    """Compute the best-words feature set, then train the classifier.

    Every lower-cased word of the ``movie_reviews`` corpus is scored with a
    chi-square measure against the 'pos' and 'neg' labels. The 10000
    top-scoring words are stored in ``self.bestwords`` and
    ``self.train_classifier()`` is called.
    """
    ## Best words feature extraction
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for label in ('pos', 'neg'):
        for word in movie_reviews.words(categories=[label]):
            token = word.lower()
            # ``fd[x] += 1`` replaces ``fd.inc(x)``, which newer NLTK
            # releases no longer provide.
            word_fd[token] += 1
            label_word_fd[label][token] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_fd['pos'][word], (freq, pos_word_count),
            total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_fd['neg'][word], (freq, neg_word_count),
            total_word_count)
        word_scores[word] = pos_score + neg_score

    # Tuple-unpacking lambdas are a syntax error on Python 3; index instead.
    best = sorted(word_scores.items(), key=lambda pair: pair[1],
                  reverse=True)[:10000]
    self.bestwords = set(w for w, s in best)
    self.train_classifier()
def prepareSentimentClassifier():
    """Train and return a Naive Bayes sentiment classifier.

    The movie-review documents are shuffled, the 3000 most frequent
    lower-cased corpus words are published in the module-level
    ``word_featuresSent`` list, and each document is converted with
    ``findFeaturesSentiment``. The first 1900 feature sets train the
    classifier; the rest are used to print its accuracy.

    Returns:
        The trained ``nltk.NaiveBayesClassifier``.
    """
    global word_featuresSent

    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    random.shuffle(documents)

    all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
    # Slicing the keys yields the first 3000 keys in whatever order the
    # mapping stores them, not the most frequent ones; rank explicitly.
    word_featuresSent = [word for word, _ in all_words.most_common(3000)]

    featuresets = [(findFeaturesSentiment(rev), category)
                   for (rev, category) in documents]
    training_set = featuresets[:1900]
    testing_set = featuresets[1900:]

    sentimentClassifier = nltk.NaiveBayesClassifier.train(training_set)
    print("Classifier accuracy percent:",(nltk.classify.accuracy(sentimentClassifier, testing_set))*100)
    return sentimentClassifier
def best_word_feats(self, words):
    """Return a feature dict restricted to the most informative words.

    The set of informative words (top 10000 by summed chi-square score
    against the 'pos' and 'neg' labels of ``movie_reviews``) does not depend
    on ``words``. It is computed on the first call and cached on the instance
    as ``_bestwords_cache`` rather than re-scanning the corpus on every call.

    Args:
        words: iterable of tokens to convert into features.

    Returns:
        ``{word: True}`` for every word of ``words`` found in the best-word
        set.
    """
    bestwords = getattr(self, '_bestwords_cache', None)
    if bestwords is None:
        word_fd = FreqDist()
        label_word_fd = ConditionalFreqDist()
        for label in ('pos', 'neg'):
            for word in movie_reviews.words(categories=[label]):
                token = word.lower()
                word_fd[token] += 1
                label_word_fd[label][token] += 1

        # n_ii = label_word_fd[label][word]
        # n_ix = word_fd[word]
        # n_xi = label_word_fd[label].N()
        # n_xx = label_word_fd.N()
        pos_word_count = label_word_fd['pos'].N()
        neg_word_count = label_word_fd['neg'].N()
        total_word_count = pos_word_count + neg_word_count

        word_scores = {}
        for word, freq in word_fd.items():
            pos_score = BigramAssocMeasures.chi_sq(
                label_word_fd['pos'][word], (freq, pos_word_count),
                total_word_count)
            neg_score = BigramAssocMeasures.chi_sq(
                label_word_fd['neg'][word], (freq, neg_word_count),
                total_word_count)
            word_scores[word] = pos_score + neg_score

        # Tuple-unpacking lambdas are a syntax error on Python 3.
        best = sorted(word_scores.items(), key=lambda pair: pair[1],
                      reverse=True)[:10000]
        bestwords = set(w for w, s in best)
        self._bestwords_cache = bestwords

    return dict((word, True) for word in words if word in bestwords)
def train_with_movie_db(self):
    """
    Training possible with movie reviews
    - this does not yield particularly good results

    Sets ``self.use_movie_reviews`` to True, trains ``self.classifier`` on
    the first three quarters of each class, and logs the accuracy measured
    on the remaining quarter through ``DLOG``.
    """
    self.use_movie_reviews = True

    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(self.feature_extraction_movie_reviews(movie_reviews.words(fileids=[f])), "negative")
                for f in negids]
    posfeats = [(self.feature_extraction_movie_reviews(movie_reviews.words(fileids=[f])), "positive")
                for f in posids]

    # Floor division keeps the cutoffs integral on Python 3, where ``/``
    # returns a float that cannot be used as a slice index.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    DLOG("train on %d instances, test on %d instances" % (len(trainfeats), len(testfeats)))

    self.classifier = NaiveBayesClassifier.train(trainfeats)
    DLOG("accuracy: " + str(util.accuracy(self.classifier, testfeats)))
    # NOTE(review): NLTK documents show_most_informative_features() as
    # printing its table; confirm that DLOG receives a useful value here.
    DLOG(self.classifier.show_most_informative_features())
def train(test=False):
    """Train a Naive Bayes classifier on the movie-review corpus.

    Args:
        test: when false (default), train on every review and return the
            classifier. When true, train on the first three quarters of each
            class, print the accuracy on the rest together with the most
            informative features, and return ``None``.

    Returns:
        The trained classifier when ``test`` is false, otherwise ``None``.
    """
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg')
                for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos')
                for f in posids]

    if not test:
        return NaiveBayesClassifier.train(negfeats + posfeats)

    # Floor division keeps the cutoffs usable as slice indices on Python 3.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    # Single-argument print() emits the same text on Python 2 and 3.
    print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

    classifier = NaiveBayesClassifier.train(trainfeats)
    print('accuracy: %s' % (nltk.classify.util.accuracy(classifier, testfeats),))
    classifier.show_most_informative_features()
def documentClassification():
    """Train and evaluate a word-presence classifier on movie reviews.

    Documents are shuffled, the 2000 most frequent lower-cased corpus words
    become ``contains(word)`` boolean features, the first 100 feature sets
    are held out for testing, and the accuracy plus the five most
    informative features are printed.
    """
    from nltk.corpus import movie_reviews

    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    random.shuffle(documents)

    all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
    # ``keys()[:2000]`` cannot be sliced on Python 3 and only gives the most
    # frequent words when keys() happens to be frequency-ordered. Ranking by
    # count states the intent and does not depend on key order.
    word_features = sorted(all_words, key=all_words.get, reverse=True)[:2000]

    def document_features(document):
        """Map a document to ``{'contains(word)': bool}`` features."""
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    print(document_features(movie_reviews.words('pos/cv957_8737.txt')))

    featuresets = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = featuresets[100:], featuresets[:100]
    classifier = nltk.NaiveBayesClassifier.train(train_set)

    print(nltk.classify.accuracy(classifier, test_set))
    classifier.show_most_informative_features(5)
def category_by_movie():
    """Train a word-presence classifier on movie reviews and print accuracy.

    Documents are shuffled, the 2000 most frequent lower-cased corpus words
    become ``contains(word)`` boolean features, and the classifier is
    trained on all but the first 100 feature sets.
    """
    from nltk.corpus import movie_reviews as mr
    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify
    import random

    documents = [(list(mr.words(f)), c)
                 for c in mr.categories()
                 for f in mr.fileids(c)]
    random.shuffle(documents)

    all_words = FreqDist(w.lower() for w in mr.words())
    # ``keys()[:2000]`` cannot be sliced on Python 3 and only gives the most
    # frequent words when keys() happens to be frequency-ordered. Ranking by
    # count does not depend on key order.
    word_features = sorted(all_words, key=all_words.get, reverse=True)[:2000]

    def document_features(document):
        """Map a document to ``{'contains(word)': bool}`` features."""
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    #print document_features(mr.words('pos/cv957_8737.txt'))
    #print documents[0]
    features = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = features[100:], features[:100]
    classifier = NaiveBayesClassifier.train(train_set)

    # NOTE(review): accuracy is measured on the training data and the
    # held-out ``test_set`` is unused; confirm this is intended.
    print(classify.accuracy(classifier, train_set))
def evaluate_classifier(featx):
    """Train on 3/4 of the movie reviews and print evaluation metrics.

    Args:
        featx: feature extractor called with the words of one review.

    Prints accuracy plus precision and recall for the 'pos' and 'neg'
    labels on the held-out quarter, then the most informative features.
    """
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg')
                for f in negids]
    posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos')
                for f in posids]

    # Floor division keeps the cutoffs usable as slice indices on Python 3.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    classifier = NaiveBayesClassifier.train(trainfeats)

    # Index sets per gold label (refsets) and per predicted label (testsets).
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    # Single-argument print() emits the same text on Python 2 and 3.
    print('accuracy: %s' % (nltk.classify.util.accuracy(classifier, testfeats),))
    print('pos precision: %s' % (nltk.metrics.precision(refsets['pos'], testsets['pos']),))
    print('pos recall: %s' % (nltk.metrics.recall(refsets['pos'], testsets['pos']),))
    print('neg precision: %s' % (nltk.metrics.precision(refsets['neg'], testsets['neg']),))
    print('neg recall: %s' % (nltk.metrics.recall(refsets['neg'], testsets['neg']),))
    classifier.show_most_informative_features()
def main(argv):
    """Classify each stdin line and emit per-topic sentiment counts.

    A Naive Bayes classifier is trained on all movie reviews. For every
    input line and every topic for which ``subj(line, topic)`` does not
    return "No match", one ``LongValueSum:`` record is printed, made of the
    text before the first ':' of the line, the matched subject and the
    predicted label, followed by a tab and "1".

    Args:
        argv: command-line arguments (not used by this function).
    """
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    #print negids

    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'negative')
                for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'positive')
                for f in posids]

    trainfeats = posfeats + negfeats
    #print trainfeats
    classifier = NaiveBayesClassifier.train(trainfeats)
    #classifier = pickle.load(open("classifier.p", "rb"))
    topicList = ["media", "sports", "news", "fashion", "finance", "politics"]

    for line in sys.stdin:
        try:
            tolk_posset = word_tokenize(line.rstrip())
            d = word_feats(tolk_posset)
            for topic in topicList:
                subjectFull = subj(line, topic)
                if subjectFull != "No match":
                    #print d
                    print("LongValueSum:" + "" + str(line.split(":")[0])+","+subjectFull + "," + classifier.classify(d) + "\t" + "1")
        except Exception:
            # Best effort: skip lines that cannot be processed, but let
            # KeyboardInterrupt / SystemExit propagate (a bare ``except:``
            # would swallow them too).
            continue
def __init__(self):
    """Load and shuffle the labelled movie-review documents.

    ``self.documents`` becomes a shuffled list of ``(words, category)``
    pairs, one per corpus file.
    """
    self.documents = [(list(movie_reviews.words(fileid)), category)
                      for category in movie_reviews.categories()
                      for fileid in movie_reviews.fileids(category)]
    random.shuffle(self.documents)

    all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
    # ``keys()[:2000]`` cannot be sliced on Python 3 and only gives the most
    # frequent words when keys() happens to be frequency-ordered. Ranking by
    # count does not depend on key order.
    # NOTE(review): ``word_features`` is a local that is never stored or
    # returned in the code visible here; confirm whether it should be kept
    # on the instance.
    word_features = sorted(all_words, key=all_words.get, reverse=True)[:2000]
def __init__(self, train1=True, train2=True, train3=True, train4=True):
    """Assemble training features from up to four sources and train.

    Args:
        train1: include the NLTK movie-review corpus ('neg' / 'pos').
        train2: include ``out.txt``; a line is negative when its first
            character is '0' and positive when it is '1', and the text
            starts at the third character.
        train3: include the AFINN word list (tab-separated term and integer
            score; a score below zero is 'neg', otherwise 'pos').
        train4: include the ' .:. '-separated training-set file (all 'pos'
            and 'neg' lines, plus the first 200 'neutral' lines).

    The collected features are stored in ``self.trainfeats`` and the trained
    classifier in ``self.classifier``.
    """
    self.trainfeats = []

    if train1:
        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')

        neg_movies = [(self.word_feats(movie_reviews.words(fileids=[f])), 'neg')
                      for f in negids]
        pos_movies = [(self.word_feats(movie_reviews.words(fileids=[f])), 'pos')
                      for f in posids]

        self.trainfeats = neg_movies + pos_movies

    if train2:
        negfeats = []
        posfeats = []
        # ``with`` closes the file even when a line fails to parse.
        with open("out.txt", "r") as f:
            for line in f:
                status = line[0]
                texto = line[2:]
                if status == '0':
                    negfeats.append((self.word_feats(texto.split(" ")), 'neg'))
                elif status == '1':
                    posfeats.append((self.word_feats(texto.split(" ")), 'pos'))

        self.trainfeats += negfeats + posfeats

    if train3:
        with open("E:\\Workspace\\WS_TG\\analisador1\\AFINN\\AFINN-111.txt", 'r') as f:
            for l in f:
                data = l.strip().split('\t')
                # Pass a list of tokens, as every other branch does; a bare
                # string would be iterated character by character.
                self.trainfeats.append(
                    (self.word_feats(data[0].split()),
                     'neg' if int(data[1]) < 0 else 'pos'))

    if train4:
        pos = []
        neutral = []
        neg = []
        with open("E:\\Workspace\\WS_TG\\api\\trainning set.txt", 'r') as f:
            for line in f:
                if line.startswith("pos"):
                    pos.append(line)
                elif line.startswith("neutral"):
                    neutral.append(line)
                elif line.startswith("neg"):
                    neg.append(line)

        # Single-argument print() emits the same text on Python 2 and 3.
        print("%d %d %d" % (len(pos), len(neutral), len(neg)))
        total = pos + neutral[:200] + neg

        for line in total:
            data = line.split(' .:. ')
            self.trainfeats.append((self.word_feats(data[1].split()), data[0]))

    self.classifier = NaiveBayesClassifier.train(self.trainfeats)
    # Called without print: the original also printed the call's return
    # value, which added a stray "None" line when the call returns nothing.
    self.classifier.show_most_informative_features(20)
def build_classifier(self):
    """Train a Naive Bayes classifier on all movie reviews.

    Each review is joined into one space-separated string and converted
    with ``self.document_features``. ``self.word_features`` is set to the
    2000 most frequent lower-cased corpus words longer than two characters.

    Returns:
        The trained ``nltk.NaiveBayesClassifier``.
    """
    documents = [(' '.join(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    random.shuffle(documents)

    all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words()
                              if len(w) > 2)
    # ``keys()[:2000]`` cannot be sliced on Python 3 and only gives the most
    # frequent words when keys() happens to be frequency-ordered. Ranking by
    # count does not depend on key order.
    self.word_features = sorted(all_words, key=all_words.get,
                                reverse=True)[:2000]

    featuresets = [(self.document_features(d), c) for (d, c) in documents]
    classifier = nltk.NaiveBayesClassifier.train(featuresets)
    return classifier
def __init__(self, load = False, loadFile = ""):
    """Load a saved classifier or train a new one on the movie reviews.

    Args:
        load: when true, call ``self.loadClassifier(loadFile)`` instead of
            training.
        loadFile: argument forwarded to ``self.loadClassifier``.
    """
    if load:
        self.loadClassifier(loadFile)
        return

    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    # Reuse the id lists fetched above instead of querying the corpus again.
    negfeats = [(self.word_feats(movie_reviews.words(fileids=[f])), 'neg')
                for f in negids]
    posfeats = [(self.word_feats(movie_reviews.words(fileids=[f])), 'pos')
                for f in posids]

    trainfeats = negfeats + posfeats
    self.classifier = NaiveBayesClassifier.train(trainfeats)
def train(feature):
    """Train a Naive Bayes classifier on every movie review.

    Args:
        feature: feature extractor called with the words of one review.

    Returns:
        The classifier trained on all 'neg' reviews followed by all 'pos'
        reviews.
    """
    neg_files = movie_reviews.fileids('neg')
    pos_files = movie_reviews.fileids('pos')

    labelled = []
    for label, files in (('neg', neg_files), ('pos', pos_files)):
        for fid in files:
            labelled.append((feature(movie_reviews.words(fileids=[fid])), label))

    return NaiveBayesClassifier.train(labelled)
def train_classifiers(self):
    """Train ``self.classifier`` (Naive Bayes) on every movie review."""
    neg_files = movie_reviews.fileids('neg')
    pos_files = movie_reviews.fileids('pos')

    labelled = []
    for label, files in (('neg', neg_files), ('pos', pos_files)):
        for fid in files:
            labelled.append((word_feats(movie_reviews.words(fileids=[fid])), label))

    # train naive bayes
    self.classifier = NaiveBayesClassifier.train(labelled)
def classify_document():
    """Classify movie reviews using the 2000 most common words as features.

    Returns:
        The result of ``classify(nltk.NaiveBayesClassifier, featuresets,
        0.1)``.
    """
    from nltk.corpus import movie_reviews
    import random

    documents = []
    for category in movie_reviews.categories():
        for fileid in movie_reviews.fileids(category):
            documents.append((list(movie_reviews.words(fileid)), category))
    random.shuffle(documents)

    frequencies = nltk.FreqDist(w.lower() for w in movie_reviews.words())
    allwords = [entry[0] for entry in frequencies.most_common(2000)]

    featuresets = []
    for doc, label in documents:
        featuresets.append((document_features(doc, allwords), label))

    return classify(nltk.NaiveBayesClassifier, featuresets, 0.1)
def train(self, feats):
    """Train and evaluate a Naive Bayes classifier on the movie reviews.

    Args:
        feats: feature extractor called with the words of one review.

    The first three quarters of each class train ``self.classifier``; the
    rest is used to print accuracy, precision and recall. Intermediate data
    (ids, feature lists, cutoffs, reference/test index sets) is kept on the
    instance, timestamps are printed along the way, and ``self.trained`` is
    set to True at the end.
    """
    # Single-argument print() emits the same text on Python 2 and 3.
    print("Starting to train the data")
    start = datetime.datetime.now()

    print("setting the ids %s" % (datetime.datetime.now(),))
    self.negids = movie_reviews.fileids('neg')
    self.posids = movie_reviews.fileids('pos')

    print("setting the feats %s" % (datetime.datetime.now(),))
    self.negfeats = [(feats(movie_reviews.words(fileids=[f])), 'neg')
                     for f in self.negids]
    self.posfeats = [(feats(movie_reviews.words(fileids=[f])), 'pos')
                     for f in self.posids]

    # Floor division keeps the cutoffs usable as slice indices on Python 3.
    self.negcutoff = len(self.negfeats) * 3 // 4
    self.poscutoff = len(self.posfeats) * 3 // 4

    print("setting the train/test %s" % (datetime.datetime.now(),))
    self.trainfeats = self.negfeats[:self.negcutoff] + self.posfeats[:self.poscutoff]
    self.testfeats = self.negfeats[self.negcutoff:] + self.posfeats[self.poscutoff:]

    print("training %s" % (datetime.datetime.now(),))
    self.classifier = NaiveBayesClassifier.train(self.trainfeats)

    self.refsets = defaultdict(set)
    self.testsets = defaultdict(set)

    print("accuracy stuff %s" % (datetime.datetime.now(),))
    # ``featureset`` rather than ``feats`` so the extractor parameter is not
    # shadowed by the loop variable.
    for i, (featureset, label) in enumerate(self.testfeats):
        self.refsets[label].add(i)
        observed = self.classifier.classify(featureset)
        self.testsets[observed].add(i)

    end = datetime.datetime.now()
    print("Training lasted for  %s" % (end - start,))
    print('accuracy: %s' % (nltk.classify.util.accuracy(self.classifier, self.testfeats),))
    print('pos precision: %s' % (nltk.metrics.precision(self.refsets['pos'], self.testsets['pos']),))
    print('pos recall: %s' % (nltk.metrics.recall(self.refsets['pos'], self.testsets['pos']),))
    print('neg precision: %s' % (nltk.metrics.precision(self.refsets['neg'], self.testsets['neg']),))
    print('neg recall: %s' % (nltk.metrics.recall(self.refsets['neg'], self.testsets['neg']),))
    self.classifier.show_most_informative_features()
    self.trained = True
def demo_movie_reviews(trainer, n_instances=None, output=None):
    """
    Train and evaluate a classifier on the Movie Reviews dataset.
    The corpus has been preprocessed using the default sentence tokenizer and
    WordPunctTokenizer.
    Features are composed of:
        - most frequent unigrams

    :param trainer: `train` method of a classifier.
    :param n_instances: total number of reviews to use for training and
        testing; half are taken from each polarity. `None` uses them all.
    :param output: file where the results are reported; nothing is written
        when it is falsy.
    """
    from nltk.corpus import movie_reviews
    from nltk.sentiment import SentimentAnalyzer

    if n_instances is not None:
        n_instances = int(n_instances/2)

    def labelled_docs(label):
        # The first ``n_instances`` reviews of one polarity, as word lists.
        selected = movie_reviews.fileids(label)[:n_instances]
        return [(list(movie_reviews.words(doc_id)), label) for doc_id in selected]

    pos_docs = labelled_docs('pos')
    neg_docs = labelled_docs('neg')

    # Each polarity is split on its own so that both the training and the
    # test portion keep the same class balance.
    pos_train, pos_test = split_train_test(pos_docs)
    neg_train, neg_test = split_train_test(neg_docs)

    training_docs = pos_train + neg_train
    testing_docs = pos_test + neg_test

    analyzer = SentimentAnalyzer()
    vocabulary = analyzer.all_words(training_docs)

    # Unigram features: words seen at least four times in the training docs.
    unigrams = analyzer.unigram_word_feats(vocabulary, min_freq=4)
    analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigrams)

    # Feature-value representation of both datasets.
    training_set = analyzer.apply_features(training_docs)
    test_set = analyzer.apply_features(testing_docs)

    classifier = analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print('Your classifier does not provide a show_most_informative_features() method.')
    results = analyzer.evaluate(test_set)

    if not output:
        return

    extractor_names = [extractor.__name__ for extractor in analyzer.feat_extractors]
    output_markdown(output, Dataset='Movie_reviews', Classifier=type(classifier).__name__,
                    Tokenizer='WordPunctTokenizer', Feats=extractor_names, Results=results,
                    Instances=n_instances)
def evaluate_classifier(featx):
    """Train a Naive Bayes classifier on 3/4 of each review class.

    Args:
        featx: feature extractor called with the words of one review.

    Returns:
        The classifier trained on the first three quarters of the 'neg' and
        'pos' reviews.
    """
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg')
                for f in negids]
    posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos')
                for f in posids]

    # Floor division keeps the cutoffs usable as slice indices on Python 3.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    return NaiveBayesClassifier.train(trainfeats)
def train():
    """Train the module-level ``classifier`` on every movie review.

    Features come from ``feature_extractor``; negative reviews are placed
    before positive ones in the training list.
    """
    global classifier

    # Train our classifier
    neg_files = movie_reviews.fileids('neg')
    pos_files = movie_reviews.fileids('pos')

    labelled = []
    for label, files in (('neg', neg_files), ('pos', pos_files)):
        for fid in files:
            words = movie_reviews.words(fileids=[fid])
            labelled.append((feature_extractor(words), label))

    classifier = NaiveBayesClassifier.train(labelled)
def train_classifier(self):
    """Train ``self.classifier`` (Naive Bayes) on every movie review.

    This code is heavily inspired by:
    http://streamhacker.com/2010/05/10/text-classification-sentiment-analysis-naive-bayes-classifier/
    """
    neg_files = movie_reviews.fileids('neg')
    pos_files = movie_reviews.fileids('pos')

    labelled = []
    for label, files in (('neg', neg_files), ('pos', pos_files)):
        for fid in files:
            words = movie_reviews.words(fileids=[fid])
            labelled.append((self.word_feats(words), label))

    self.classifier = NaiveBayesClassifier.train(labelled)
def main():
    """Score three sample texts with classifiers built on four feature sets.

    For each sample text, four sections are printed: plain word features,
    punctuation-filtered, stop-word-filtered and lemmatized features. In
    each section ``classification`` is called with five pairs of trailing
    numeric arguments and the result is handed to the matching
    ``calculateScore*`` helper together with the words of the text.
    """
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    to_review1 = "A man with a magnanimous spirit helps a mute girl from Pakistan return home."
    to_review2 = "Forced out of his own company by former Darren Cross, Dr. Hank Pym (Michael Douglas) recruits the talents of Scott Lang (Paul Rudd), a master thief just released from prison. Lang becomes Ant-Man, trained by Pym and armed with a suit that allows him to shrink in size, possess superhuman strength and control an army of ants. The miniature hero must use his new skills to prevent Cross, also known as Yellowjacket, from perfecting the same technology and using it as a weapon for evil."
    to_review3 = '''Parents need to know that kids may clamor to see this fast-paced, action-packed comic book-based adventure. But it's definitely more age-appropriate for teens than younger children. Although much of the violence is clearly meant to be based in the realm of sci-fi and fantasy -- and/or is shown at a distance -- there's plenty of it, from massive explosions to children held at gunpoint to super-powered fistfights. Some of the violence is war themed, and some characters get hurt and/or die. While much is made of lead character Tony Stark's devil-may-care lifestyle of fun and frolic, viewers also see him turn away from the more irresponsible aspects of playboyhood. 
Language is minimal, and sexual content is more suggested than shown overall -- though there are a few eyebrow-raising moments.'''
    reviews = [to_review1, to_review2, to_review3]

    def labelled_feats(extractor):
        # Labelled (features, label) lists for both polarities.
        neg = [(extractor(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
        pos = [(extractor(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
        return neg, pos

    # Corpus features do not depend on the text being reviewed, so build
    # each variant once instead of once per review.
    negfeats, posfeats = labelled_feats(word_feats)
    negfeats_punct, posfeats_punct = labelled_feats(word_feats_punctuations)
    negfeats_stop, posfeats_stop = labelled_feats(word_feats_stopwords)
    negfeats_lemma, posfeats_lemma = labelled_feats(word_feats_lemmatize)

    english_stops = set(stopwords.words('english'))

    # Third and fourth positional arguments passed to ``classification``.
    # NOTE(review): presumably per-class fractions; confirm against helper.
    settings = [(1, 1), (1, 0.95), (0.95, 1), (0.9, 1), (1, 0.9)]

    for to_review in reviews:
        to_review_words = to_review.split(" ")
        # Single-argument print() emits the same text on Python 2 and 3.
        print("Reviewing" + " " + to_review + " " + "\n\n\n")

        print(''' Normal classification ''' + " " + "\n\n")
        for a, b in settings:
            calculateScore(classification(negfeats, posfeats, a, b), to_review_words)

        # Each section below trains on its own feature variant; the variant
        # lists used to be built and then ignored in favour of the plain
        # ``negfeats`` / ``posfeats``.
        print(''' Without Punctuations ''' + " " + "\n\n")
        for a, b in settings:
            calculateScore_punctuations(
                classification(negfeats_punct, posfeats_punct, a, b), to_review_words)

        print(''' Without Stop Words ''' + " " + "\n\n")
        wordstoreview = [each for each in to_review_words if each not in english_stops]
        # Use the stop-word-filtered tokens for every setting, not only the
        # first one.
        for a, b in settings:
            calculateScore_stopwords(
                classification(negfeats_stop, posfeats_stop, a, b), wordstoreview)

        print(''' With Lemmatizer ''' + " " + "\n\n")
        for a, b in settings:
            calculateScore_lemmatizer(
                classification(negfeats_lemma, posfeats_lemma, a, b), to_review_words)
def evaluate_classifier(featx):
    """Train on 3/4 of the movie reviews and print evaluation metrics.

    Args:
        featx: feature extractor called with the words of one review.

    Up to 1,500,000 data rows of 'Sentiment Analysis Dataset.csv' are read
    and shuffled first. The classifier is then trained on the movie-review
    corpus, and accuracy, precision and recall on the held-out quarter are
    printed together with the most informative features.
    """
    from itertools import islice

    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    count = 1500000
    print("%s Reading files..." % (ctime(),))
    # ``with`` closes the file even if reading fails. Plain "r" is used
    # because the "U" flag is rejected by recent Python 3 releases.
    with open('Sentiment Analysis Dataset.csv', "r") as csv_file:
        next(csv_file, None)  # skip the header row
        lines = list(islice(csv_file, count))
    # NOTE(review): ``lines`` is shuffled but never used below; the read is
    # kept so the function still requires the file and advances the random
    # state as before. Confirm whether it can be dropped.
    random.shuffle(lines)

    negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg')
                for f in negids]
    posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos')
                for f in posids]

    # Floor division keeps the cutoffs usable as slice indices on Python 3.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    classifier = NaiveBayesClassifier.train(trainfeats)

    # Index sets per gold label (refsets) and per predicted label (testsets).
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    # Single-argument print() emits the same text on Python 2 and 3.
    print('accuracy: %s' % (nltk.classify.util.accuracy(classifier, testfeats),))
    print('pos precision: %s' % (nltk.metrics.precision(refsets['pos'], testsets['pos']),))
    print('pos recall: %s' % (nltk.metrics.recall(refsets['pos'], testsets['pos']),))
    print('neg precision: %s' % (nltk.metrics.precision(refsets['neg'], testsets['neg']),))
    print('neg recall: %s' % (nltk.metrics.recall(refsets['neg'], testsets['neg']),))
    classifier.show_most_informative_features()
def train_classifier(self):
    """Train ``self.sentiment_classifier`` on 3/4 of each review class.

    Features are produced by ``self.best_word_feats``; the first three
    quarters of the 'neg' and 'pos' reviews form the training set.
    """
    # Training
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(self.best_word_feats(movie_reviews.words(fileids=[f])), 'neg')
                for f in negids]
    posfeats = [(self.best_word_feats(movie_reviews.words(fileids=[f])), 'pos')
                for f in posids]

    # Floor division keeps the cutoffs usable as slice indices on Python 3.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    self.sentiment_classifier = NaiveBayesClassifier.train(trainfeats)
def trainMovies():
    """Train a Naive Bayes classifier on all movie reviews and pickle it.

    The classifier is written to ``movie_semtiment_classifier.pickle``.
    """
    negids = movie_reviews.fileids('neg')
    # Single-argument print() emits the same text on Python 2 and 3.
    print("%s %s" % (type(negids), negids))
    posids = movie_reviews.fileids('pos')

    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg')
                for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos')
                for f in posids]

    classifier = NaiveBayesClassifier.train(negfeats + posfeats)

    # ``with`` closes the file even if pickling raises.
    with open('movie_semtiment_classifier.pickle', 'wb') as f:
        pickle.dump(classifier, f)
def train():
    """Estimate multinomial Naive Bayes parameters from the movie reviews.

    For each corpus category the first 10% of its files are set aside as
    test documents and the last 90% are used for training. Training tokens
    are re-tokenized, stripped of digits and punctuation, and filtered
    against the English stop-word list.

    Returns:
        A tuple ``(logPrior, likelihoods, vocab, test_list)``:

        * ``logPrior`` - list of ``log(N_c / N_docs)``, one per category;
        * ``likelihoods`` - ``{class_index: {word: log-probability}}`` with
          add-one smoothing over ``vocab``;
        * ``vocab`` - distinct training tokens in first-seen order;
        * ``test_list`` - list of held-out file-id lists, one per category.
    """
    from collections import Counter

    print('train')
    test_list = []
    logPrior = []
    likelihoods = {}
    class_counts = []   # one token Counter per category
    vocab_seen = {}     # dict used as an insertion-ordered set

    N_docs = len(movie_reviews.fileids())
    classes = movie_reviews.categories()

    # Loop-invariant helpers, built once rather than once per document.
    table = str.maketrans('', '', string.digits + string.punctuation)
    stop_words = set(stopwords.words('english'))  # common small words

    for c_id, c in enumerate(classes):
        likelihoods[c_id] = {}
        class_list = movie_reviews.fileids(c)
        N_c = len(class_list)
        logPrior.append(math.log(N_c / N_docs))

        # get test and train list
        tenPercent = int(len(class_list) * .1)
        lastPercent = int(len(class_list) * .9)
        test_list.append(class_list[:tenPercent])
        # Train only on the last 90%. A misspelt ``class_List`` used to
        # leave this slice unused, so held-out files were trained on.
        # Slicing from ``len - lastPercent`` also avoids ``[-0:]`` returning
        # everything.
        train_docs = class_list[len(class_list) - lastPercent:]

        # Fresh counter per class; a shared accumulator used to carry the
        # previous class's text over into the next one.
        counts = Counter()
        for title in train_docs:
            review = ' '.join(movie_reviews.words(title))
            tok = word_tokenize(review)
            filtered = [w.translate(table) for w in tok]
            filtered = [w for w in filtered if w not in stop_words]
            filtered = [w for w in filtered if len(w) > 0]
            counts.update(filtered)
            for w in filtered:
                vocab_seen[w] = None
        class_counts.append(counts)

    vocab = list(vocab_seen)

    # Second pass, because the vocabulary of every class is needed.
    for c_id, c in enumerate(classes):
        print(c)
        counts = class_counts[c_id]
        # Token counts, not ``str.count``: substring counting also matched
        # words embedded in longer words.
        count_w_v_c = sum(counts[w] + 1 for w in vocab)
        for w in vocab:
            count_w_c = counts[w] + 1
            likelihoods[c_id][w] = math.log(count_w_c / count_w_v_c)

    return logPrior, likelihoods, vocab, test_list
from nltk.corpus import movie_reviews
import nltk
import random

# prepare data set with labels
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

# FreqDist() counts the number of occurrences of each word
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
# Take the 2000 most frequent words; slicing list(all_words) yields the
# first 2000 keys in storage order instead.
word_features = [word for word, _count in all_words.most_common(2000)]


# checks whether each of these words is present in a given document.
def document_features(document):
    """Return ``{'contains(word)': bool}`` for every word in word_features."""
    # Build a set of the document's words first: membership tests on a set
    # are much faster than on a list.
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features


#print(document_features(movie_reviews.words('pos/cv957_8737.txt')))

featuresets = [(document_features(d), c) for (d, c) in documents]
train_set = featuresets[100:]
test_set = featuresets[:100]
classfier = nltk.NaiveBayesClassifier.train(train_set)

print(nltk.classify.accuracy(classfier, test_set))
if token in positiveWords: countPos += 1 if token in negativeWords: countNeg += 1 if countPos >= countNeg: features['guess'] = "positive" elif countNeg > countPos: features['guess'] = "negative" return features # prepare review data as a list of tuples: # (list of tokens, category) # category is positive / negative review_data = [(movie_reviews.words(fileid), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)] threshold = 10000 # 10000 appears to be the best threshold fd_all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words()) top_words = [word for (word, freq) in fd_all_words.most_common(threshold)] review_data_fdist = [(nltk.FreqDist(token.lower() for token in words if token in top_words), category) for words, category in review_data] # Shuffle data randomly random.seed(42) random.shuffle(review_data_fdist) # Split in training (80 percent) and test set (20 percent)
('This is an amazing place!', 'pos'), ('I feel very good about these beers.', 'pos'), ('This is my best work.', 'pos'), ("What an awesome view", 'pos'), ('I do not like this restaurant', 'neg'), ('I am tired of this stuff.', 'neg'), ("I can't deal with this", 'neg'), ('He is my sworn enemy!', 'neg'), ('My boss is horrible.', 'neg')] test = [('The beer was good.', 'pos'), ('I do not enjoy my job', 'neg'), ("I ain't feeling dandy today.", 'neg'), ("I feel amazing!", 'pos'), ('Gary is a friend of mine.', 'pos'), ("I can't believe I'm doing this.", 'neg')] cl = DecisionTreeClassifier(train) # Grab some movie review data reviews = [(list(movie_reviews.words(fileid)), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)] random.shuffle(reviews) new_train, new_test = reviews[0:100], reviews[100:150] # Update the classifier with the new training data cl.update(new_train) # Compute accuracy accuracy = cl.accuracy(test + new_test) print("Accuracy: {0}".format(accuracy)) # Show 5 most informative features #cl.show_informative_features(5)
import itertools
import pandas as pd
import numpy as np
import string

# Lookup sets used by isStopWord; built once at import time.
sw = set(stopwords.words('english'))
punctuation = set(string.punctuation)
all_names = {name.lower() for name in names.words()}


def isStopWord(word):
    """Return True when *word* should be excluded from the vocabulary.

    A word is excluded if it is an English stop word, a punctuation
    character, contains any non-alphabetic character, or matches a
    lower-cased entry of the names corpus. Every call site in this script
    passes an already lower-cased token.
    """
    if word in sw or word in punctuation:
        return True
    if not word.isalpha():
        return True
    return word in all_names


# Corpus-wide frequencies of the tokens that survive the filter.
review_words = movie_reviews.words()
filtered = []
for raw_word in review_words:
    lowered_word = raw_word.lower()
    if not isStopWord(lowered_word):
        filtered.append(lowered_word)
words = FreqDist(filtered)

# One space-joined string per review, keeping only surviving tokens that
# occur more than once in the whole corpus.
texts = []
for fid in movie_reviews.fileids():
    kept = []
    for token in movie_reviews.words(fid):
        lowered = token.lower()
        if isStopWord(lowered):
            continue
        if words[lowered] > 1:
            kept.append(lowered)
    texts.append(" ".join(kept))

vectorizer = TfidfVectorizer(stop_words='english')
matrix = vectorizer.fit_transform(texts)
# Column sums of the TF-IDF matrix, flattened to one value per term.
sums = np.array(matrix.sum(axis=0)).ravel()
# Read https://stackoverflow.com/questions/10059594/a-simple-explanation-of-naive-bayes-classification
# for how NaiveBayesClassifier works.

# Built once: the original evaluated stopwords.words('english') for every
# single token and then scanned the resulting list linearly.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def naive_bayes_input(words):
    """Build the feature dict that NaiveBayesClassifier expects as input.

    :param words: iterable of tokens from one review.
    :return: dict mapping every token that is not an English stop word
        to True.
    """
    return {word: True for word in words if word not in _ENGLISH_STOPWORDS}


# The sentiment analysis code is just a machine learning algorithm that has
# been trained to identify positive/negative reviews.
negative_reviews = [
    (naive_bayes_input(movie_reviews.words(fileid)), 'negative')
    for fileid in movie_reviews.fileids('neg')
]
positive_reviews = [
    (naive_bayes_input(movie_reviews.words(fileid)), 'positive')
    for fileid in movie_reviews.fileids('pos')
]
# print(len(negative_reviews), len(positive_reviews))

# The first 800 reviews of each class train the model; the rest are held out.
train_set = negative_reviews[:800] + positive_reviews[:800]
test_set = negative_reviews[800:] + positive_reviews[800:]
classifier = NaiveBayesClassifier.train(train_set)
accuracy = nltk.classify.util.accuracy(classifier, test_set)
# define a 80/20 split for train/test
SPLIT = 0.8

# file IDs for the positive and negative reviews
posids = movie_reviews.fileids('pos')
negids = movie_reviews.fileids('neg')


def word_feats(words):
    """Return a bag-of-words feature mapping for one review.

    :param words: iterable of tokens.
    :return: defaultdict in which every token seen maps to True; looking up
        any other key yields (and stores) False.
    """
    feats = defaultdict(lambda: False)
    for word in words:
        feats[word] = True
    return feats


posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]

# The cutoff is derived from the positive class and applied to both classes.
cutoff = int(len(posfeats) * SPLIT)
trainfeats = negfeats[:cutoff] + posfeats[:cutoff]
testfeats = negfeats[cutoff:] + posfeats[cutoff:]

# Single-argument print calls produce the same text on Python 2 and 3.
print('Train on %d instances' % len(trainfeats))
print('Test on %d instances' % len(testfeats))

classifier = NaiveBayesClassifier.train(trainfeats)
print('Accuracy: %s' % nltk.classify.util.accuracy(classifier, testfeats))
from nltk.corpus import movie_reviews
import nltk
import random
import pickle

# (tokens, category) pair for every review, in random order.
document = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        document.append((movie_reviews.words(fileid), category))
random.shuffle(document)

# Frequency of every lower-cased token in the corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# The 3000 most frequent words. Slicing list(all_words.keys()) only returns
# the first 3000 distinct words in insertion order, not the top ones.
word_features = [word for word, _count in all_words.most_common(3000)]
# print(word_features)

# document = ["words","category","words","category",..]
# word_features = [top 3000 words]


def find_features(document):
    """Return presence features for one review.

    :param document: iterable of tokens of a single review.
    :return: dict mapping each entry of ``word_features`` to True when it
        occurs in *document*, otherwise False.
    """
    words = set(document)
    return {w: (w in words) for w in word_features}
import nltk
import random
from nltk.corpus import movie_reviews
import pickle
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import SVC, NuSVC

# (token list, category) pair for every review, in random order.
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))
random.shuffle(documents)

# Frequency of every lower-cased token in the corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# Use the 3000 most frequent words as features. list(all_words.keys())[:3000]
# only yields the first 3000 distinct words in insertion order.
word_features = [word for word, _count in all_words.most_common(3000)]


def find_features(document):
    """Return presence features for one review.

    :param document: iterable of tokens of a single review.
    :return: dict mapping each entry of ``word_features`` to True when it
        occurs in *document*, otherwise False.
    """
    words = set(document)
    return {w: (w in words) for w in word_features}
import nltk import random from nltk.corpus import movie_reviews corpus_list = [] corpus_list = [(list(movie_reviews.words(fileid)), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)] random.shuffle(corpus_list) entire_words_list = [] for word in movie_reviews.words(): entire_words_list.append(word.lower()) entire_words_list = nltk.FreqDist(entire_words_list) features_wd = list(entire_words_list.keys())[:4000] def locate_word_features(corpus): words_text = set(corpus) feature_set = {} for word in features_wd: feature_set[word] = (word in words) return feature_set print((locate_word_features(movie_reviews.words('pos/cv000_29590.txt')))) features = [(locate_word_features(reveue), category)
def we_represent(tokens):
    """Return the sum of the word vectors of *tokens*.

    :param tokens: iterable of string tokens.
    :return: numpy array of length 300; all zeros when no token is found
        in ``w2v``.
    """
    vec = numpy.zeros(300)
    for tok in tokens:
        key = tok.lower()
        # Look up with the same lower-cased key that was tested. The original
        # indexed with the raw token, which fails (or picks a different
        # vector) for capitalised tokens.
        if key in w2v:
            vec += w2v[key]
    return vec


training_instances = []
training_labels = []
test_instances = []
test_labels = []
for label in movie_reviews.categories():
    for fileid in movie_reviews.fileids(label):
        doc = movie_reviews.words(fileid)
        instance = we_represent(doc)
        lbl = 1 if label == 'pos' else 0
        # One draw in ten (randint(0, 9) == 0) sends the review to the test set.
        if random.randint(0, 9) == 0:
            test_instances.append(instance)
            test_labels.append(lbl)
        else:
            training_instances.append(instance)
            training_labels.append(lbl)

print(training_instances)
print(training_labels)
print(test_instances)
for c in self._classifiers: v = c.classify(features) votes.append(v) return mode(votes) def confidence(self, features): votes = [] for c in self._classifiers: v = c.classify(features) votes.append(v) choice_votes = votes.count(mode(votes)) conf = choice_votes / len(votes) return conf documents = [(list(movie_reviews.words(fileid)), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)] random.shuffle(documents) all_words = [] for w in movie_reviews.words(): all_words.append(w.lower()) all_words = nltk.FreqDist(all_words) word_features = list(all_words.keys())[:3000] def find_features(document):
# Text Classification

# Import libraries
import nltk
import random
from nltk.corpus import movie_reviews

# Create a list of (token list, category) tuples, one per review
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))

# Shuffle the documents
random.shuffle(documents)
print(documents[0])

# Normalize the dataset (lower-case every token) and count frequencies
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
print(all_words.most_common(15))
print(all_words['love'])

# Limit the words to the 3000 most frequent ones. Slicing
# list(all_words.keys()) only returns the first 3000 distinct words in
# insertion order, not the most frequent.
# NOTE(review): the name is misspelt ("featuers"); kept because later code
# may refer to it.
word_featuers = [word for word, _count in all_words.most_common(3000)]
def demo_movie_reviews(trainer, n_instances=None, output=None):
    """
    Train and evaluate a classifier on the Movie Reviews dataset.

    The corpus has been preprocessed using the default sentence tokenizer
    and WordPunctTokenizer. The only features are the most frequent unigrams
    of the training documents.

    :param trainer: `train` method of a classifier.
    :param n_instances: total number of reviews to use for training and
        testing; half are taken from each polarity.
    :param output: the output file where results have to be reported.
    """
    from nltk.corpus import movie_reviews
    from nltk.sentiment import SentimentAnalyzer

    if n_instances is not None:
        n_instances = int(n_instances / 2)

    def labelled_docs(label):
        # A slice bound of None keeps every file id of this polarity.
        selected_ids = movie_reviews.fileids(label)[:n_instances]
        return [(list(movie_reviews.words(fid)), label) for fid in selected_ids]

    positives = labelled_docs("pos")
    negatives = labelled_docs("neg")

    # Each polarity is split on its own so that the train and test sets keep
    # a balanced class distribution.
    pos_train, pos_test = split_train_test(positives)
    neg_train, neg_test = split_train_test(negatives)
    training_docs = pos_train + neg_train
    testing_docs = pos_test + neg_test

    analyzer = SentimentAnalyzer()
    vocabulary = analyzer.all_words(training_docs)

    # Register the unigram feature extractor.
    unigram_feats = analyzer.unigram_word_feats(vocabulary, min_freq=4)
    analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Turn both document sets into feature-value representations.
    training_set = analyzer.apply_features(training_docs)
    test_set = analyzer.apply_features(testing_docs)

    classifier = analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print(
            "Your classifier does not provide a show_most_informative_features() method."
        )
    results = analyzer.evaluate(test_set)

    if not output:
        return
    extractor_names = [extractor.__name__ for extractor in analyzer.feat_extractors]
    output_markdown(
        output,
        Dataset="Movie_reviews",
        Classifier=type(classifier).__name__,
        Tokenizer="WordPunctTokenizer",
        Feats=extractor_names,
        Results=results,
        Instances=n_instances,
    )
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews


def extract_features(word_list):
    """Return a feature dict mapping every word in *word_list* to True."""
    return {word: True for word in word_list}


if __name__=='__main__':
    # Load positive and negative reviews
    positive_fileids = movie_reviews.fileids('pos')
    negative_fileids = movie_reviews.fileids('neg')

    features_positive = [(extract_features(movie_reviews.words(fileids=[f])),
            'Positive') for f in positive_fileids]
    features_negative = [(extract_features(movie_reviews.words(fileids=[f])),
            'Negative') for f in negative_fileids]

    # Split the data into train and test (80/20)
    threshold_factor = 0.8
    threshold_positive = int(threshold_factor * len(features_positive))
    threshold_negative = int(threshold_factor * len(features_negative))

    features_train = features_positive[:threshold_positive] + features_negative[:threshold_negative]
    features_test = features_positive[threshold_positive:] + features_negative[threshold_negative:]

    # Single-argument print calls produce the same text on Python 2 and 3;
    # the original print statements are a syntax error on Python 3.
    print("\nNumber of training datapoints: %d" % len(features_train))
    print("Number of test datapoints: %d" % len(features_test))

    # Train a Naive Bayes classifier
    classifier = NaiveBayesClassifier.train(features_train)
    print("\nAccuracy of the classifier: %s"
          % nltk.classify.util.accuracy(classifier, features_test))
# Everything from index train_size onwards is held out for evaluation.
x_test = all_tagged_sents[train_size:]

# Unigram tagger that falls back to the fixed tag 'n'.
fallback_tagger = nltk.DefaultTagger('n')
tagger = nltk.UnigramTagger(train=x_train, backoff=fallback_tagger)

tokens = nltk.word_tokenize(u'我 认为 不丹 的 被动 卷入 不 构成 此次 对峙 的 主要 因素。')
tagged = tagger.tag(tokens)
# Result recorded by the original author:
# ["我", "R"], ["认为", "V"], ["不丹", "n"], ["的", "U"], ["被动", "A"], ["卷入", "V"], ["不", "D"], ["构成", "V"], ["此次", "R"], ["对峙", "V"], ["的", "U"], ["主要", "B"], ["因素。", "n"]
print(tagger.evaluate(x_test))  # original author recorded 0.871

#################################################### NLTK study notes, part 3: text classification and building a classification-based POS tagger
## 1. Text classification example
import random
import nltk
from nltk.corpus import movie_reviews

# (token list, category) pair for every review, in random order.
docs = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        docs.append((list(movie_reviews.words(fileid)), category))
random.shuffle(docs)

# The 2000 most frequent lower-cased words of the whole corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
most_comment_word = [entry[0] for entry in all_words.most_common(2000)]


def doc_feature(doc):
    """Map each word of ``most_comment_word`` to whether it occurs in *doc*."""
    present = set(doc)
    return {candidate: (candidate in present) for candidate in most_comment_word}
negative_fileids = movie_reviews.fileids('neg')


def build_bag(words):
    """Return a bag-of-words dict for one review.

    Tokens found in ``exclude`` are skipped. Every other token is
    NFKD-normalised, encoded to ASCII with unencodable characters dropped,
    and stored as a key with the value 1.
    """
    bag = {}
    for token in words:
        if token in exclude:
            continue
        ascii_token = unicodedata.normalize('NFKD', token).encode('ascii', 'ignore')
        bag[ascii_token] = 1
    return bag


# One (bag, label) pair per review.
pos_bag = [(build_bag(movie_reviews.words(fileids=[f])), 'pos')
           for f in positive_fileids]
neg_bag = [(build_bag(movie_reviews.words(fileids=[f])), 'neg')
           for f in negative_fileids]

# The first `split` reviews of each class train the model; the rest test it.
split = 800
sentiment_classifier = NaiveBayesClassifier.train(
    pos_bag[:split] + neg_bag[:split])

# Accuracies expressed as percentages.
train_accuracy = nltk.classify.util.accuracy(
    sentiment_classifier, pos_bag[:split] + neg_bag[:split]) * 100
test_accuracy = nltk.classify.util.accuracy(
    sentiment_classifier, pos_bag[split:] + neg_bag[split:]) * 100
import nltk import random from nltk.corpus import movie_reviews from nltk.classify.scikitlearn import SklearnClassifier from sklearn.naive_bayes import BernoulliNB, MultinomialNB from sklearn.linear_model import LogisticRegression, SGDClassifier from sklearn.svm import SVC, LinearSVC, NuSVC documents = [] for category in movie_reviews.categories(): for fileid in movie_reviews.fileids(category): t = () t = (list(movie_reviews.words(fileid)), category) documents.append(t) random.shuffle(documents) #print(documents[0]) all_words = [] for w in movie_reviews.words(): all_words.append(w.lower()) all_words = nltk.FreqDist(all_words) word_features = list(all_words.keys())[:3000] def find_features(document): words = set(document) features = {} for w in word_features:
# -*- coding: utf-8 -*- """ Created on Wed Jun 28 07:44:58 2017 @author: Naruto_kathi """ import nltk import random from nltk.corpus import movie_reviews import pickle documents = [(list(movie_reviews.words(fileid)), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)] random.shuffle(documents) allwords = list(w.lower() for w in movie_reviews.words()) allwords = nltk.FreqDist(allwords) #print(allwords.most_common(20)) word_features = list(allwords.keys())[:3000] #print(word_features,"List of top 50 most common words") def find_features(document): words = set(document) features = {} for w in word_features:
dataset = list() for fileid in fileids: fileid_count = fileid_count + 1 review = set(movie_reviews.words(fileid)) features = dict() for word in featureset: features[word] = word in review pos_or_neg = fileid[:3] dataset.append((features, pos_or_neg)) return dataset # Collect all the words in the training examples vocabulary = set() for fileid in train_fileids: for word in movie_reviews.words(fileid): vocabulary.add(word) # Try a feature set of 500 random words vocabulary = list(vocabulary) random.shuffle(vocabulary) random_featureset = vocabulary[:500] train_set = format_dataset(train_fileids, random_featureset) test_set = format_dataset(test_fileids, random_featureset) bayes = NaiveBayesClassifier.train(train_set) print("Random words: ", random_featureset) print("Naive Bayes accuracy:", accuracy(bayes, test_set)) # Try a feature set of the 500 words that appear most often in the training examples
#pylint: disable=C0103
'''Classify movie reviews as positive or negative with a Naive Bayes model.'''
import random
import string
from itertools import chain
from nltk.corpus import movie_reviews as mr
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
import nltk

stop = stopwords.words('english')
# Set copy for O(1) membership tests; `stop` itself stays a list.
_stop_lookup = set(stop)

# (filtered tokens, category) per review; the category is the directory
# part of the file id.
documents = [
    ([w for w in mr.words(i)
      if w.lower() not in _stop_lookup and w.lower() not in string.punctuation],
     i.split('/')[0])
    for i in mr.fileids()
]

# mr.fileids() returns the 'neg/...' files before the 'pos/...' files, so an
# unshuffled 90/10 slice would test on positive reviews only. A private,
# seeded generator keeps the split reproducible and leaves the global
# random state untouched.
random.Random(42).shuffle(documents)

# The 100 most frequent remaining tokens. Slicing list(keys()) would only
# give the first 100 distinct tokens in insertion order.
word_freq = FreqDist(chain.from_iterable(tokens for tokens, _tag in documents))
word_features = [word for word, _count in word_freq.most_common(100)]


def _presence_features(tokens):
    """Map each feature word to whether it occurs in *tokens*."""
    present = set(tokens)
    return {word: (word in present) for word in word_features}


numtrain = int(len(documents) * 90 / 100)
train_set = [(_presence_features(tokens), tag) for tokens, tag in documents[:numtrain]]
test_set = [(_presence_features(tokens), tag) for tokens, tag in documents[numtrain:]]

classifier = nbc.train(train_set)
print((nltk.classify.accuracy(classifier, test_set)))
classifier.show_most_informative_features(5)
# In[90]: nltk.download("punkt") # In[91]: romeo_words = nltk.word_tokenize(romeo_text) # In[92]: romeo_words #now the punctiuations are seperated nicely # In[93]: movie_reviews.words(fileids=positive_fileids[0]) # In[94]: #simplest way tfor analysing text is to thing about words as an unordered collection of words #dictionary {word: True for word in romeo_words} # In[95]: type(_) # '_' is the output from last code i.e. the line above # In[96]:
def train(self, feats):
    """Train and evaluate a Naive Bayes classifier on the movie reviews.

    The first three quarters of each class train the model and the last
    quarter tests it. Accuracy, per-class precision/recall and the most
    informative features are printed, and the intermediate data is kept on
    the instance (``negfeats``, ``posfeats``, ``trainfeats``, ``testfeats``,
    ``classifier``, ``refsets``, ``testsets``).

    :param feats: callable that turns a review's tokens into a feature dict.
    """
    # Every print is a single-argument call so the output is the same on
    # Python 2 and Python 3.
    print("Starting to train the data")
    start = datetime.datetime.now()

    print("setting the ids %s" % datetime.datetime.now())
    self.negids = movie_reviews.fileids('neg')
    self.posids = movie_reviews.fileids('pos')

    print("setting the feats %s" % datetime.datetime.now())
    self.negfeats = [(feats(movie_reviews.words(fileids=[f])), 'neg')
                     for f in self.negids]
    self.posfeats = [(feats(movie_reviews.words(fileids=[f])), 'pos')
                     for f in self.posids]

    # Floor division keeps the cutoffs integral; `* 3 / 4` is a float on
    # Python 3 and cannot be used as a slice index.
    self.negcutoff = len(self.negfeats) * 3 // 4
    self.poscutoff = len(self.posfeats) * 3 // 4

    print("setting the train/test %s" % datetime.datetime.now())
    self.trainfeats = (self.negfeats[:self.negcutoff]
                       + self.posfeats[:self.poscutoff])
    self.testfeats = (self.negfeats[self.negcutoff:]
                      + self.posfeats[self.poscutoff:])

    print("training %s" % datetime.datetime.now())
    self.classifier = NaiveBayesClassifier.train(self.trainfeats)

    # Indices of test items grouped by true label (refsets) and by
    # predicted label (testsets), as the precision/recall helpers expect.
    self.refsets = defaultdict(set)
    self.testsets = defaultdict(set)

    print("accuracy stuff %s" % datetime.datetime.now())
    # The loop variable no longer shadows the `feats` parameter.
    for i, (featureset, label) in enumerate(self.testfeats):
        self.refsets[label].add(i)
        observed = self.classifier.classify(featureset)
        self.testsets[observed].add(i)

    end = datetime.datetime.now()
    print("Training lasted for  %s" % (end - start))
    print('accuracy: %s'
          % nltk.classify.util.accuracy(self.classifier, self.testfeats))
    print('pos precision: %s'
          % nltk.metrics.precision(self.refsets['pos'], self.testsets['pos']))
    print('pos recall: %s'
          % nltk.metrics.recall(self.refsets['pos'], self.testsets['pos']))
    print('neg precision: %s'
          % nltk.metrics.precision(self.refsets['neg'], self.testsets['neg']))
    print('neg recall: %s'
          % nltk.metrics.recall(self.refsets['neg'], self.testsets['neg']))
    self.classifier.show_most_informative_features()
    self.trained = True
# random.shuffle(documents)
# print(documents[1])

# Group the file ids by category (the part of the id before '/').
# NOTE(review): `documents` is rebound to a new list a few lines below, so
# this grouping is discarded; it is kept because the two shuffles advance
# the random state. Presumably `documents` is a dict of lists here; confirm.
for i in mr.fileids():
    documents[i.split('/')[0]].append(i)
random.shuffle(documents['pos'])
random.shuffle(documents['neg'])
#print(documents['pos'][:10]) # first ten pos reviews.
#print
#print(documents['neg'][:10]) # first ten neg reviews.

# (filtered tokens, category) per review, in random order.
documents = [([
    w for w in mr.words(i)
    if w.lower() not in stop and w.lower() not in string.punctuation
], i.split('/')[0]) for i in mr.fileids()]
random.shuffle(documents)

# Frequency of every lower-cased token in the corpus.
allWords = nltk.FreqDist(w.lower() for w in mr.words())
#print(allWords.most_common(15))
#print(allWords["stupid"])

# The 3000 most frequent words. Slicing list(allWords.keys()) would only
# give the first 3000 distinct words in insertion order.
wordFeatures = [word for word, _count in allWords.most_common(3000)]
# AUTHOR: GIRISH SRINIVAS
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.corpus import wordnet as wn

all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
# The 2000 most frequent words. `all_words.keys()[:2000]` fails on Python 3
# and, with NLTK 3, is not ordered by frequency.
word_features = [word for word, _count in all_words.most_common(2000)]

# WordNet label per feature word, filled on first use so that WordNet is
# queried once per word instead of once per word per document.
_wordnet_labels = {}


def _wordnet_label(word):
    """Return 'KNOWN' if WordNet has at least one synset for *word*, else 'UNK'."""
    label = _wordnet_labels.get(word)
    if label is None:
        label = 'KNOWN' if wn.synsets(word) else 'UNK'
        _wordnet_labels[word] = label
    return label


def document_features4a(document):
    """Return the WordNet-coverage features used for every document.

    :param document: tokens of one review.
    :return: dict mapping 'contains(<word>)' to 'KNOWN' or 'UNK' for each
        entry of ``word_features``.

    NOTE(review): as in the original, the result does not depend on
    *document*, so every review gets identical features; confirm whether
    the features were meant to be limited to words present in the document.
    """
    return {'contains(%s)' % word: _wordnet_label(word) for word in word_features}


def main():
    """Train on all but 100 shuffled reviews and print accuracy on those 100."""
    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    random.shuffle(documents)
    featuresets = [(document_features4a(d), c) for (d, c) in documents]
    train_set, test_set = featuresets[100:], featuresets[:100]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    # Single-argument call: same output on Python 2 and 3.
    print(nltk.classify.accuracy(classifier, test_set))
    classifier.show_most_informative_features(5)
import nltk
import random
from nltk.corpus import movie_reviews
import pickle
import os.path
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

# (token list, category) pair for every review, in random order.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

# Frequency of every lower-cased token in the corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# Use the 3000 most frequent words as features. list(all_words.keys())[:3000]
# only yields the first 3000 distinct words in insertion order.
word_features = [word for word, _count in all_words.most_common(3000)]


def find_features(document):
    """Return presence features for one review.

    :param document: iterable of tokens of a single review.
    :return: dict mapping each entry of ``word_features`` to True when it
        occurs in *document*, otherwise False.
    """
    words = set(document)
    return {w: (w in words) for w in word_features}
#TEXT CLASSIFICATION import nltk import random from nltk.corpus import movie_reviews documents = [ (list(movie_reviews.words(fileid)), category ) #it contain all the words which has class label as positive or negative for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category) ] random.shuffle( documents ) #bcz postive and negative example are together...1000 postivie than 1000 negative all_words = [] for w in movie_reviews.words( ): #append all the words in a list. so that we can classify. all_words.append(w.lower()) all_words = nltk.FreqDist(all_words) word_features = list(all_words.keys())[:3000] def find_features(document): words = set(document) features = {}
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy as nltk_accuracy


def extract_features(words):
    """Return a feature dict that maps every word in *words* to True."""
    return {word: True for word in words}


if __name__=='__main__':
    # File ids of the positive and negative reviews
    fileids_pos = movie_reviews.fileids('pos')
    fileids_neg = movie_reviews.fileids('neg')

    # One (features, label) pair per review
    features_pos = []
    for f in fileids_pos:
        review_words = movie_reviews.words(fileids=[f])
        features_pos.append((extract_features(review_words), 'Positive'))
    features_neg = []
    for f in fileids_neg:
        review_words = movie_reviews.words(fileids=[f])
        features_neg.append((extract_features(review_words), 'Negative'))

    # 80% of each class goes to training, the remaining 20% to testing
    threshold = 0.8
    num_pos = int(threshold * len(features_pos))
    num_neg = int(threshold * len(features_neg))

    # Create training and test datasets
    features_train = features_pos[:num_pos] + features_neg[:num_neg]
    features_test = features_pos[num_pos:] + features_neg[num_neg:]

    # Report the size of each dataset
    print('\nNumber of training datapoints:', len(features_train))
    print('Number of test datapoints:', len(features_test))
import collections
import nltk.metrics
from nltk.metrics import precision, recall, f_measure
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

# Print every token of the reviews in the 'pos' category, one per line.
for word in movie_reviews.words(categories=['pos']):
    print(word)
import nltk
from nltk.corpus import movie_reviews
import random

# step 1 object set: one (tokens, class) pair per review
reviews = [(movie_reviews.words(reviewid), clas)
           for clas in ['pos', 'neg']
           for reviewid in movie_reviews.fileids(clas)]

# step 1.5 get the most common 100 JJs from a random sample of 200 reviews
sample = random.sample(reviews, 200)
pos = [nltk.pos_tag(x) for x, y in sample]
jjlist = []
for i in pos:
    # Keep alphabetic tokens of at least three characters whose tag
    # contains "JJ".
    jj = [
        x[0] for x in i
        if "JJ" in x[1] and len(x[0]) >= 3 and x[0].isalpha()
    ]
    jjlist += jj
frequency = nltk.FreqDist(jjlist)
common_jj = [x for x, y in frequency.most_common(100)]


# step2 feature function
def feature(text):
    """Return presence features for one review.

    :param text: iterable of tokens.
    :return: dict mapping each entry of ``common_jj`` to True when it
        equals some lower-cased token of *text*, otherwise False.
    """
    # Built once per call; the original rebuilt this set for every one of
    # the 100 adjectives.
    lowered = {x.lower() for x in text}
    return {adjective: adjective in lowered for adjective in common_jj}


#step 3 generate the feature set