def prep_reviews_data(self):  # messy code to test classifier with movie reviews
    if not self.movie_review_data:
        print('Preparing movie reviews...\n')
        from nltk.corpus import movie_reviews
        docs = [movie_reviews.raw(fileid)
                for category in movie_reviews.categories()
                for fileid in movie_reviews.fileids(category)]
        process = lambda x: 1 if x == 'pos' else -1
        labels = [process(category)
                  for category in movie_reviews.categories()
                  for fileid in movie_reviews.fileids(category)]
        docs, labels = double_shuffle(docs, labels)
        training, testing = divide_list_by_ratio(docs)
        self.train_labs, self.test_labs = divide_list_by_ratio(labels)
        train_vecs = self.vectorizer.fit_transform(training)
        test_vecs = self.vectorizer.transform(testing)
        if isinstance(self.model, naive_bayes.GaussianNB):
            train_vecs = train_vecs.toarray()
            test_vecs = test_vecs.toarray()
        self.train_vecs = train_vecs
        self.test_vecs = test_vecs
        self.movie_review_data = True
        self.news_market_data = False
def download_data_if_not_yet():
    """
    Download the data set, if the data set has not been downloaded yet.
    """
    try:
        # make sure that nltk can find the data
        if paddle.v2.dataset.common.DATA_HOME not in nltk.data.path:
            nltk.data.path.append(paddle.v2.dataset.common.DATA_HOME)
        movie_reviews.categories()
    except LookupError:
        print("Downloading movie_reviews data set, please wait.....")
        nltk.download(
            'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME)
        print("Download data set success.....")
        print("Path is " + nltk.data.find('corpora/movie_reviews').path)
def f2c(corpus, fileName):
    if corpus == 'mr':
        from nltk.corpus import movie_reviews as mr
        return mr.categories(fileids=fileName)[0]
    else:
        from nltk.corpus import reuters
        return reuters.categories(fileids=fileName)[0]
def category_by_movie():
    from nltk.corpus import movie_reviews as mr
    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.corpus import names
    from nltk.classify import apply_features
    import random

    documents = [(list(mr.words(f)), c)
                 for c in mr.categories()
                 for f in mr.fileids(c)]
    random.shuffle(documents)

    all_words = FreqDist(w.lower() for w in mr.words())
    word_features = list(all_words)[:2000]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    # print(document_features(mr.words('pos/cv957_8737.txt')))
    # print(documents[0])

    features = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = features[100:], features[:100]
    classifier = NaiveBayesClassifier.train(train_set)
    print(classify.accuracy(classifier, train_set))
def create_sentiment():
    """
    Train sentiment model and save.

    Input type: None
    Output: Model as pickle
    """
    random.seed(1)

    test = [
        ("The dude presenting Unravel seems like one of the most genuine game developers Ive ever seen I really hope this game works out for him", 'pos'),
        ("His hands are shaking Dude looks so stoked and scared at the same time", 'pos'),
        ("Right I just felt like I was watching his dream come true It was nice The game looks very well done as well Good for him", 'pos'),
        ("Seriously Unravel looks really good actually and honestly seeing him so happy about what hes made is contagious I want to see more of Unravel ", 'pos'),
        ("He was so nervous shaking all over his voice quivering", 'neg'),
        ("The game looked nice too very cute art style ", 'pos'),
        ("You could tell he genuinely wanted to be there it looked like he was even shaking from the excitement I hope it works out for them aswell", 'pos'),
        ("However following that up with the weird PvZ thing was odd To say the least", 'neg'),
        ("Haha The game did look nice though Im definitely going to keep an eye on it I enjoy supporting such hopeful developers", 'pos'),
        ("Very personable This looks like a buy for me As a dev in a other sector I appreciate this passion", 'pos'),
        ("I want to give him a cookie", 'pos'),
        ("Im getting a copy Im gonna support my indie devs", 'pos'),
        ("The twitch leak was accurate It was like a play by play you start speaking French then switch to English", 'neg'),
        ("yep exactly what i was thinking lol its important to note that the twitch leak never had them saying it was Dishonored 2 but that they were honored to be here very different", 'neg'),
        ("Honored Im 100 sure that was intentional", 'neg'),
        ("oh yea for sure but wasnt solid enough evidence imo to be like dishonored 2 confirmed just based off that", 'neg'),
        ("The confirmation was who was talking not what they were talking about ", 'neg'),
        ("How awkward is it for a pop singer to perform at a video game conference", 'neg'),
        ("Oh god did they warn him that he will get zero reaction", 'neg'),
        ("I really hope so", 'pos'),
        ("Almost as bad as Aisha f*****g up her dialogue constantly Shes doing alright though E3 is really becoming a mainstream media event Hollywood has nothing like this ComicCon is the only comparison and they dont dazzle it up like E3", 'neg')
    ]

    # Grab review data
    reviews = [
        (list(movie_reviews.words(fileid)), category)
        for category in movie_reviews.categories()
        for fileid in movie_reviews.fileids(category)
    ]
    random.shuffle(reviews)

    # Hold out the last 100 reviews (5%) for testing, train on the rest
    new_train, new_test = reviews[:1900], reviews[1900:]

    # Train the NB classifier on the train split
    cl = NaiveBayesClassifier(new_train)

    # Compute accuracy
    accuracy = cl.accuracy(test + new_test)
    print("Accuracy: {0}".format(accuracy))

    # Show 5 most informative features
    cl.show_informative_features(5)

    # Save model for use in creating social model sentiment
    with open('sentiment_clf_full.pkl', 'wb') as pk:
        pickle.dump(cl, pk)
    print('done saving model')
def documentClassification():
    from nltk.corpus import movie_reviews

    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    random.shuffle(documents)

    all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
    word_features = list(all_words)[:2000]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    print(document_features(movie_reviews.words('pos/cv957_8737.txt')))

    featuresets = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = featuresets[100:], featuresets[:100]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print(nltk.classify.accuracy(classifier, test_set))
    classifier.show_most_informative_features(5)
def createFeatureSet(numOfExamples):
    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)[:numOfExamples]]

    with open('documents.txt', 'wb') as f:
        pickle.dump(documents, f)

##    #read from file
##    with open('documents.txt', 'rb') as f:
##        documents = pickle.load(f)

    random.shuffle(documents)

    all_words = []
##    for w in movie_reviews.words():
##        all_words.append(w.lower())

    #write to file
##    with open('allwords.txt', 'wb') as f:
##        pickle.dump(all_words, f)

    #read from file
    with open('allwords.txt', 'rb') as f:
        all_words = pickle.load(f)

    freqDist = nltk.FreqDist(all_words)
    #print('freq dist')
    #print(freqDist.most_common(50))

    word_features = freqDist.most_common(3000)
    featuresets = [(find_features(rev, word_features), category)
                   for (rev, category) in documents]
    return featuresets
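# find_features() is called above but not defined in this snippet. A minimal sketch of
# what such a helper usually looks like (assumed, not the original code): it marks, for
# every candidate word, whether it appears in the review. Since createFeatureSet() passes
# freqDist.most_common(3000), i.e. (word, count) pairs, the word is unpacked from each pair.
def find_features(document, word_features):
    words = set(document)
    features = {}
    for word, _count in word_features:
        features[word] = (word in words)
    return features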
def prepareSentimentClassifier():
    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    random.shuffle(documents)

    all_words = []
    for w in movie_reviews.words():
        all_words.append(w.lower())
    all_words = nltk.FreqDist(all_words)

    global word_featuresSent
    word_featuresSent = list(all_words.keys())[:3000]

    featuresets = [(findFeaturesSentiment(rev), category)
                   for (rev, category) in documents]

    training_set = featuresets[:1900]
    testing_set = featuresets[1900:]

    sentimentClassifier = nltk.NaiveBayesClassifier.train(training_set)
    print("Classifier accuracy percent:",
          (nltk.classify.accuracy(sentimentClassifier, testing_set)) * 100)

    return sentimentClassifier
def create_dataset():
    '''Create dataset from movie reviews dataset'''
    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    random.shuffle(documents)
    return documents
def main():
    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    random.shuffle(documents)
    featuresets = [(document_features8b(d), c) for (d, c) in documents]
    train_set, test_set = featuresets[100:], featuresets[:100]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print(nltk.classify.accuracy(classifier, test_set))
def __init__(self):
    self.documents = [(list(movie_reviews.words(fileid)), category)
                      for category in movie_reviews.categories()
                      for fileid in movie_reviews.fileids(category)]
    random.shuffle(self.documents)
    all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
    word_features = list(all_words)[:2000]
def data_run():
    # print "Preparing Data..."
    labels = movie_reviews.categories()
    labeled_words = [(l, movie_reviews.words(categories=[l])) for l in labels]
    high_info_words = set(Params.high_information_words(labeled_words))
    feat_det = lambda words: Params.bag_of_words_in_set(words, high_info_words)
    feats = Train.label_feat_from_corps(movie_reviews, feature_detector=feat_det)
    return Train.split_label_feats(feats)
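# bag_of_words_in_set() comes from the Params/featx-style helper module used above and is
# not shown in this snippet. A minimal sketch of the usual implementation (assumed): keep
# only the words that are in the high-information set and mark each as a present feature.
def bag_of_words_in_set(words, goodwords):
    return {word: True for word in words if word in goodwords}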
def label_docs():
    docs = [(list(movie_reviews.words(fid)), cat)
            for cat in movie_reviews.categories()
            for fid in movie_reviews.fileids(cat)]
    random.seed(42)
    random.shuffle(docs)
    return docs
def main():
    """
    Sample training using the movie reviews corpus (Pang, Lee).
    """
    #== load inputs
    documents = np.array([movie_reviews.raw(review_id)
                          for category in movie_reviews.categories()
                          for review_id in movie_reviews.fileids(category)])
    sentiment_scores = np.array([0 if category == 'neg' else 1
                                 for category in movie_reviews.categories()
                                 for review_id in movie_reviews.fileids(category)])

    #== select random indices
    n = documents.shape[0]
    indices = np.random.permutation(n)
    threshold = int(np.floor(n * 0.8))  # 80% training set / 20% test set
    train_idx, test_idx = indices[:threshold], indices[threshold:]

    #== select training and validation sets according to these indices
    # (documents and sentiment_scores are 1-D arrays, so index them directly)
    x_train, x_test = documents[train_idx], documents[test_idx]
    y_train, y_test = sentiment_scores[train_idx], sentiment_scores[test_idx]

    #== train the model
    print('===== Training the model...')
    sentiment = SentimentMachine(x_train.tolist(), y_train.tolist())
    w = sentiment.train(speed=0.001, stochastic=False)
    print('===== Model trained.')

    #== test efficiency of the model
    print('===== Testing the model...')
    # compute the MSE
    h = lambda a, b: sigmoid(np.dot(a, b))
    x = sentiment.compute_features_matrix(x_test.tolist())
    mse = cost(w, x, y_test, h)
    # compute the number of valid classifications
    n_test = y_test.shape[0]
    valid = 0
    for i in range(n_test):
        valid += 1 if sentiment.classify(x_test[i]) == y_test[i] else 0
    percent = 100.0 * valid / n_test
    # print results
    print('== Number of well-classified documents: {0} / {1} ({2}%)'
          .format(valid, n_test, percent))
    print('== Cost value on the test set: %.4f' % mse)
def get_documents():
    """ Retrieve shuffled movie reviews from the nltk """
    print("Retrieving Movie Reviews\n")
    reviews = [(list(movie_reviews.words(fileid)), category)
               for category in movie_reviews.categories()
               for fileid in movie_reviews.fileids(category)]
    # random.shuffle() shuffles in place and returns None, so use
    # random.sample() to return a shuffled copy instead
    return random.sample(reviews, len(reviews))
def getdata(percentsplit=80, numofdocs=100):
    docs = [(list(movie_reviews.words(fileid)), category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]
    random.shuffle(docs)
    if numofdocs == 0:
        numofdocs = len(docs)
    print(len(docs))
    # integer division so the result can be used as a slice index
    numoftrainingexamples = numofdocs * percentsplit // 100
    traindocs, testdocs = docs[:numoftrainingexamples], docs[numoftrainingexamples:]
    return traindocs, testdocs
def classify_document():
    from nltk.corpus import movie_reviews
    import random
    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    random.shuffle(documents)
    all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
    allwords = [w for w, _ in all_words.most_common(2000)]
    featuresets = [(document_features(d, allwords), c) for d, c in documents]
    return classify(nltk.NaiveBayesClassifier, featuresets, 0.1)
def get_testset_trainset_nltk_mr(train_to_test_ratio=0.3):
    from nltk.corpus import movie_reviews as mr
    train_test = [[], []]
    for category in mr.categories():
        categories_file_name_dict[category] = mr.fileids(categories=category)
    for cat in categories_file_name_dict.keys():
        li = categories_file_name_dict[cat]
        size = int(len(li) * train_to_test_ratio)
        cat_num_docs[cat] = size
        train_test[0].extend(li[:size])
        train_test[1].extend(li[size:])
    return train_test
def train_func(func1=Sys_Params.remove_punctuation, func2=Sys_Params.non_stop_words, func3=Sys_Params.do_pos, func4=Sys_Params.do_lmtize_pos, func5=Sys_Params.high_information_words, flag=2): tst = "This should be a GOOD TEST" if flag != 3: def func_final(tst): tst = func1(tst) tst = func2(tst) tst = func3(tst) ans = func4(tst) # ans = Sys_Params.bag_of_words(tst) return ans # final_func = Sys_Params.bag_of_words(func4(func3(func2(func1())))) print "Passing the function" labels = movie_reviews.categories() labeled_words = [(l, movie_reviews.words(categories=[l])) for l in labels] high_info_words = set(Sys_Params.high_information_words(labeled_words)) feat_det = lambda words: Sys_Params.bag_of_words_in_set(func_final(words), high_info_words) # feat_det = lambda words: Sys_Params.bag_of_words_in_set(words, high_info_words) feats = label_feat_from_corps(movie_reviews, feature_detector=feat_det) # # print final_func # return final_func elif flag == 3: def func_final(tst): tst = func1(tst) tst = func2(tst) tst = func3(tst) tst = func4(tst) ans = Sys_Params.bag_of_words(tst) return ans # final_func = Sys_Params.bag_of_words(func4(func3(func2(func1())))) print "Passing the function" feats = label_feat_from_corps(movie_reviews, func_final) # feats = label_feat_from_corps(movie_reviews) # print final_func # return final_func training, testing = split_label_feats(feats) Classifier_NB.run(training) nb_Classifier = Classifier_NB.load_classifier() Classifier_DT.run(training) dt_Classifier = Classifier_DT.load_classifier() Classifier_ME.run(training) me_Classifier = Classifier_ME.load_classifier() inst = Classifier_MV.MaxVoteClassifier(nb_Classifier, dt_Classifier, me_Classifier) inst.save_classifier(inst) print "******DONE TRAINING******" return
def load_movie_reviews():
    reviews = [(list(movie_reviews.words(fileid)), category)
               for category in movie_reviews.categories()
               for fileid in movie_reviews.fileids(category)]
    random.shuffle(reviews)
    print('' + str(len(reviews)) + ' reviews loaded')  # 2000

    if False:
        doc = reviews[0]
        print('doc type: ' + str(type(doc)) + ' length: ' + str(len(doc)))
        for elem in doc:
            print('elem type: ' + str(type(elem)) + ' length: ' + str(len(elem)))
        '''
        doc type: <type 'tuple'> length: 2
        elem type: <type 'list'> length: 711  <- array of words in the movie review
        elem type: <type 'str'> length: 3     <- 'pos', meaning positive review
        '''
    return reviews
def document_classification_movie_reviews():
    from nltk.corpus import movie_reviews
    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    random.shuffle(documents)

    # use the most frequent 2000 words as features
    all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
    word_features = list(all_words)[:2000]

    featuresets = [(_document_features(d, word_features), category)
                   for (d, category) in documents]
    train_set, test_set = featuresets[100:], featuresets[:100]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print(nltk.classify.accuracy(classifier, test_set))
    classifier.show_most_informative_features(30)
def get_data():
    """
    Get movie review data
    """
    dataset = []
    y_labels = []
    # extract categories
    for cat in movie_reviews.categories():
        # for files in each category
        for fileid in movie_reviews.fileids(cat):
            # get the words in that file
            words = list(movie_reviews.words(fileid))
            dataset.append((words, cat))
            y_labels.append(cat)
    return dataset, y_labels
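# A small, assumed usage example for get_data(): join each review's tokens back into a
# string and vectorize it with scikit-learn. The harness below (CountVectorizer,
# train_test_split) is standard scikit-learn but is not part of the original snippet.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

dataset, y_labels = get_data()
texts = [" ".join(words) for words, _cat in dataset]
x_train, x_test, y_train, y_test = train_test_split(texts, y_labels, test_size=0.2, random_state=0)
vectorizer = CountVectorizer()
x_train_vec = vectorizer.fit_transform(x_train)
x_test_vec = vectorizer.transform(x_test)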
def __init__(self):
    self.documents = [(list(movie_reviews.words(fileid)), category)
                      for category in movie_reviews.categories()
                      for fileid in movie_reviews.fileids(category)]
    random.shuffle(self.documents)
    self.stopset = set(stopwords.words('english'))
    self.K_FOLDS = 10
    self.fullfeatures = []
    self.features_X = []
    self.features_Y = []
    self.negative = ["wasn\'t", 'don\'t', 'not', 'bad', 'worst', 'ugly', 'hate']
    self.end = ['\,', '\.']
    self.negationFeatures = []
def movie_reviews_words():
    # collect all the documents
    for category in movie_reviews.categories():
        for fileid in movie_reviews.fileids(category):
            documents.append((movie_reviews.words(fileid), category))
    # random.shuffle(documents)

    # each entry looks like: ([u'plot', u':', u'a', u'human', u'space', u'astronaut', ...], u'pos')
    print(documents[0])
    print(documents[1800])
    # print('\n')

    # find the most frequent words across all the documents
    all_words = nltk.FreqDist(movie_reviews.words())
    # returns (word, frequency) pairs
    return all_words
def book_train(self, feats):
    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    random.shuffle(documents)
    self.all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words()
                                   if w not in self.stopset)
    self.word_features = list(self.all_words)[:1000]
    featuresets = [(self.text_feats(d), c) for (d, c) in documents]
    train_set, test_set = featuresets[200:], featuresets[:200]
    self.classifier = nltk.NaiveBayesClassifier.train(train_set)
    print(nltk.classify.accuracy(self.classifier, test_set))
    self.classifier.show_most_informative_features(10)
def train_func_default():
    print("Passing the function")
    labels = movie_reviews.categories()
    labeled_words = [(l, movie_reviews.words(categories=[l])) for l in labels]
    high_info_words = set(Sys_Params.high_information_words(labeled_words))
    feat_det = lambda words: Sys_Params.bag_of_words_in_set(words, high_info_words)
    feats = label_feat_from_corps(movie_reviews, feature_detector=feat_det)
    training, testing = split_label_feats(feats)

    Classifier_NB.run(training)
    nb_Classifier = Classifier_NB.load_classifier()
    Classifier_DT.run(training)
    dt_Classifier = Classifier_DT.load_classifier()
    Classifier_ME.run(training)
    me_Classifier = Classifier_ME.load_classifier()

    inst = Classifier_MV.MaxVoteClassifier(nb_Classifier, dt_Classifier, me_Classifier)
    inst.save_classifier(inst)
    print("******DONE TRAINING******")
    return
def get_word_dict():
    """
    Sort the words by the frequency with which they occur in the samples
    :return: words_freq_sorted
    """
    words_freq_sorted = list()
    word_freq_dict = collections.defaultdict(int)
    download_data_if_not_yet()
    for category in movie_reviews.categories():
        for field in movie_reviews.fileids(category):
            for words in movie_reviews.words(field):
                word_freq_dict[words] += 1
    words_sort_list = list(word_freq_dict.items())
    words_sort_list.sort(key=lambda x: x[1], reverse=True)
    for index, word in enumerate(words_sort_list):
        words_freq_sorted.append((word[0], index))
    return words_freq_sorted
def train_naive_bayes():
    '''
    Trains a naive bayes classifier using the nltk movie reviews corpus to classify
    movie-relevant text as positive or negative.
    NOTE: Easily generalizable to a different domain given a similarly structured corpus
    '''
    # Modified from http://www.nltk.org/book/ch06.html
    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    random.shuffle(documents)
    all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
    global word_features
    word_features = list(all_words)[:3000]
    featuresets = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = featuresets[1000:], featuresets[:1000]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    return classifier
def main():
    # from nltk.corpus import names
    # names = ([(name, 'male') for name in names.words('male.txt')] +
    #          [(name, 'female') for name in names.words('female.txt')])
    # random.shuffle(names)
    # print names
    # print
    # train_set = names
    # classifier = nltk.NaiveBayesClassifier.train(train_set)
    # name = classifier.classify("Jaime")
    # print name

    # featuresets = [(gender_features(n), g) for (n,g) in names]
    # train_set = [(gender_features(n), g) for (n, g) in names]
    # train_set, test_set = featuresets[500:], featuresets[:500]
    # print train_set
    # print
    # classifier = nltk.NaiveBayesClassifier.train(train_set)
    # name = classifier.classify(gender_features('Neo'))
    # print name

    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    random.shuffle(documents)
    featuresets = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = featuresets[100:], featuresets[:100]
    print(train_set)
    print()
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print(nltk.classify.accuracy(classifier, test_set))
    classifier.show_most_informative_features(5)
    print('done')
def makePrediction():
    labels = movie_reviews.categories()
    print("Labels for reviews are: {}\n".format(labels))
    labeled_words = [(label, movie_reviews.words(categories=[label])) for label in labels]
    print("Labeled words:{}\n".format(labeled_words[:10]))
    high_info_words = set(Toolbox.high_information_words(labeled_words))
    print("High information words:{}\n".format(list(high_info_words)[:10]))
    feat_det = lambda words: Toolbox.bag_of_words_in_set(words, high_info_words)
    lfeats = Toolbox.label_feats_from_corpus(movie_reviews, feature_detector=feat_det)
    train_feats, test_feats = Toolbox.split_label_feats(lfeats)
    mv_classifier = ClassifierTrainer.trainClassifier(train_feats)
    accuracyScore = accuracy(mv_classifier, test_feats)
    print("Accuracy is {}".format(accuracyScore))
import random
from nltk.corpus import movie_reviews
from review_sentiment import ReviewSentiment
import classification

if __name__ == '__main__':
    labeled_data = [(movie_reviews.raw(fileids=fileid), movie_reviews.categories(fileid)[0])
                    for fileid in movie_reviews.fileids()]
    random.seed(1234)
    random.shuffle(labeled_data)
    labeled_data = labeled_data[:100]
    rs = ReviewSentiment(labeled_data, train_size=50)
    classifiers = classification.train(rs)
    classification.evaluate(rs, classifiers)
    classifier = classifiers[0][0]
    print()
    print("positive reviews prediction")
    classification.predict(rs, "data/positive/", classifier, 0)
    print()
    print("negative reviews prediction")
    classification.predict(rs, "data/negative/", classifier, 0)
# Naive Bayes Classifier
import nltk
import math
from nltk.corpus import movie_reviews, stopwords

# get our movie reviews from nltk.corpus (reviews stored as tuples (review, class))
documents = [(movie_reviews.raw(fileid), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

classes = movie_reviews.categories()  # ['pos', 'neg']

trainingSet = documents[100:900] + documents[1100:1900]
devSet = documents[900:1000] + documents[1900:]
testSet = documents[:100] + documents[1000:1100]


def train(trainingSet, classes):
    # train the data
    n = len(trainingSet)  # total number of docs
    log_prior = {}  # dictionary to hold log prior for all cases
    fulltext = ""
    # dictionary that holds bigdoc for each class
    bigdoc_dict = {}
    # dictionary that holds number of docs in each class
    num_docs = {}
    for c in classes:
        bigdoc_dict[c] = ""
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf


documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
# random.shuffle(documents)

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())[:3000]


def find_features(document):
ne_nrr = ne_chunk(ne_tag)

new = "The cat ate the little mouse who was after fresh cheese"
new_token = nltk.pos_tag(word_tokenize(new))
grammer_np = r"NP: {<DT>?<JJ>*<NN>}"
chunk_praser = nltk.RegexpParser(grammer_np)
chunk_result = chunk_praser.parse(new_token)

import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import movie_reviews

movie_reviews.categories()
pos_rev = movie_reviews.fileids("pos")
neg_rev = movie_reviews.fileids("neg")
rev = nltk.corpus.movie_reviews.words('pos/cv565_29572.txt')

rev_list = []
for rev in neg_rev:
    rev_text_neg = rev = nltk.corpus.movie_reviews.words(rev)
    review_one_string = " ".join(rev_text_neg)
    # remove the space that join() puts before punctuation
    review_one_string = review_one_string.replace(" ,", ",")
    review_one_string = review_one_string.replace(" .", ".")
    review_one_string = review_one_string.replace("\' ", "'")
    review_one_string = review_one_string.replace(" \'", "'")
    rev_list.append(review_one_string)
len(rev_list)
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import os
import nltk.corpus

print(os.listdir(nltk.data.find('corpora')))

from nltk.corpus import movie_reviews

print(movie_reviews.categories())
print(len(movie_reviews.fileids('pos')))
print()
print(movie_reviews.fileids('pos'))

neg_rev = movie_reviews.fileids('neg')
print(len(neg_rev))
print(neg_rev)

rev = nltk.corpus.movie_reviews.words('pos/cv000_29590.txt')
print(rev)

rev_list = []
for rev in neg_rev:
    rev_text_neg = rev = nltk.corpus.movie_reviews.words(rev)
    review_one_string = " ".join(rev_text_neg)
import nltk
from nltk.corpus import movie_reviews
from pylab import plot, show
from numpy import array
from numpy.random import rand
from scipy.cluster.vq import kmeans, vq, whiten
import numpy as np
import random

# each review joined into a string, paired with its label as a 2-tuple
documents = [(' '.join(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
documents_words = [w for (w, t) in documents]

from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer builds the term-frequency matrix; element a[i][j] is the count of word j in document i
vectorizer = CountVectorizer(min_df=20, stop_words='english')
# TfidfTransformer computes the tf-idf weight of each word
transformer = TfidfTransformer()
# fit_transform on the vectorizer turns the texts into a term-frequency matrix,
# fit_transform on the transformer converts that matrix into tf-idf weights
tfidf = transformer.fit_transform(vectorizer.fit_transform(documents_words))
# all words in the bag-of-words model
word = vectorizer.get_feature_names()
# extract the tf-idf matrix; element a[i][j] is the tf-idf weight of word j in document i
features = tfidf.toarray()
target = [c for (d, c) in documents]

data = whiten(features)
centroids, _ = kmeans(data, 2)
idx, _ = vq(data, centroids)
target1 = [1 if x == 'pos' else 0 for x in target]
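# Not part of the original snippet: a minimal, assumed follow-up that checks how well the
# unsupervised k-means clusters line up with the pos/neg labels. Because cluster ids are
# arbitrary, agreement is computed for both labelings and the better one is kept.
agreement = np.mean(np.array(idx) == np.array(target1))
agreement = max(agreement, 1.0 - agreement)
print('cluster/label agreement: %.3f' % agreement)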
print(w5.wup_similarity(w4))

###################### Text Classification ##########################
from nltk.corpus import movie_reviews  # already labeled
import random
import nltk

# category: pos / neg; get the word vectors of every file in movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]  # tuples
# words (features) make up the elements
random.shuffle(documents)  # shuffle

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

# nltk word frequency distribution
# 39768 in total
allwords = nltk.FreqDist(all_words)
print(allwords.most_common(15))
import nltk
nltk.download('reuters')
from nltk.corpus import reuters

reuters.fileids()
reuters.categories()

fileid = 'test/16399'
text = reuters.raw(fileid)
text1 = reuters.raw(categories='zinc')
reuters.categories(fileid)

import nltk
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews

movie_reviews.fileids()
movie_reviews.categories()

fileid = 'neg/cv956_12547.txt'
text = movie_reviews.raw(fileid)
text1 = movie_reviews.raw(categories='neg')
movie_reviews.categories(fileid)

# Frequency distribution by creating our own corpus
from nltk.corpus import PlaintextCorpusReader

fileid = 'C:/Users/ITRAIN-12/Desktop/Day 2/gaming.txt'
my_corpus = PlaintextCorpusReader(fileid, '.*')
text = my_corpus.raw(fileid)
text
my_corpus.raw(fileid)
my_corpus.words(fileid)
#############################################################################
from featx import label_feats_from_corpus, split_label_feats, high_information_words, bag_of_words_in_set
from classification import precision_recall  # classification.py must be in the same dir.
from nltk.corpus import movie_reviews
from nltk.classify.util import accuracy
from nltk.classify import NaiveBayesClassifier
from nltk.classify import MaxentClassifier
from nltk.classify import DecisionTreeClassifier
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import LinearSVC

# a "high information word" is a word strongly biased towards a single classification label
# "low information words" are words that are common across all classification labels
labels = movie_reviews.categories()
labeled_words = [(l, movie_reviews.words(categories=[l])) for l in labels]
high_info_words = set(high_information_words(labeled_words))
feat_det = lambda words: bag_of_words_in_set(words, high_info_words)
lfeats = label_feats_from_corpus(movie_reviews, feature_detector=feat_det)
train_feats, test_feats = split_label_feats(lfeats)

print("######################################################################")
nb_classifier = NaiveBayesClassifier.train(train_feats)
print("Accuracy Naive Bayes: " + str(accuracy(nb_classifier, test_feats)))  # Accuracy: 0.91
nb_precisions, nb_recalls = precision_recall(nb_classifier, test_feats)
print("Precisions Naive Bayes Pos: " + str(nb_precisions['pos']))  # Precisions Pos: 0.8988326848249028
#!/usr/bin/env python
# coding: utf-8

# In[27]:

import nltk
from nltk import NaiveBayesClassifier
from nltk.corpus import movie_reviews
import random

# In[28]:

cats = movie_reviews.categories()
reviews = []
for a in cats:
    for fid in movie_reviews.fileids(a):
        review = (list(movie_reviews.words(fid)), a)
        reviews.append(review)
random.shuffle(reviews)

# In[3]:

all_wd_in_reviews = nltk.FreqDist(wd.lower() for wd in movie_reviews.words())
top_wd_in_reviews = [list(wds) for wds in zip(*all_wd_in_reviews.most_common(2000))][0]

# In[4]:

def ext_ft(review, top_words):
def main(): # - categories 'neg' and 'pos' # - fileid ex - 'neg/cv000_29416.txt' documents = [(list(movie_reviews.words(fileid)), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)] random.shuffle(documents) training_set = documents[:1600] #1600 sentences in training set held_out_set = documents[-400:-200] #200 sentences in held out set test_set = documents[-200:] # 200 sentences in test set sentiment = Sentiment() sentiment.Set_uni_bi_stat(training_set) sentiment.Conclude_uni_bi_stat() real_labels = sentiment.Real_labels(held_out_set) # MaxEnt with only Bigram features print "--- MaxEnt---" cutoffs1 = [0, 25, 50, 100, 250, 500, 1000] cutoffs2 = [0, 12, 25, 50, 125, 250, 500] sentiment.Uni_count_list() sentiment.Bi_count_list() # with info gain print "--- Unigram + Bigram with information gain---" sentiment.Uni_high_score_list() sentiment.Bi_high_score_list() for cutoff in cutoffs2: print "uni features: 500" print "bi features: " + str(cutoff) trainsets_uni = sentiment.Feature_set_uni(500, training_set, 2) trainsets_bi = sentiment.Feature_set_bi(cutoff, training_set, 2) # combine the feature set of unigram and bigram trainsets_uni_bi = [] for i in range(len(trainsets_uni)): temp = dict(trainsets_uni[i][0]) temp.update(trainsets_bi[i][0]) trainsets_uni_bi.append((temp, trainsets_uni[i][1])) held_out_sets_uni = sentiment.Feature_set_uni(500, held_out_set, 2) held_out_sets_bi = sentiment.Feature_set_bi(cutoff, held_out_set, 2) held_out_sets_uni_bi = [] for i in range(len(held_out_sets_uni)): temp = dict(held_out_sets_uni[i][0]) temp.update(held_out_sets_bi[i][0]) held_out_sets_uni_bi.append((temp, held_out_sets_uni[i][1])) algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0] classifier_uni_bi = nltk.MaxentClassifier.train(trainsets_uni_bi, algorithm, max_iter=50) classifier_uni_bi.show_most_informative_features(10) print "\nClassifier Accuracy : %4f\n" % nltk.classify.accuracy( classifier_uni_bi, held_out_sets_uni_bi) predicted_labels_uni_bi = sentiment.Maxent_predicted( classifier_uni_bi, held_out_sets_uni_bi) sentiment.Evaluation(predicted_labels_uni_bi, real_labels) # without infor grain print "--- Unigram + Bigram without info gain---" for cutoff in cutoffs1: print "uni features: 1000" print "bi features: " + str(cutoff) trainsets_uni = sentiment.Feature_set_uni(1000, training_set, 1) trainsets_bi = sentiment.Feature_set_bi(cutoff, training_set, 1) # combine the feature set of unigram and bigram trainsets_uni_bi = [] for i in range(len(trainsets_uni)): temp = dict(trainsets_uni[i][0]) temp.update(trainsets_bi[i][0]) trainsets_uni_bi.append((temp, trainsets_uni[i][1])) held_out_sets_uni = sentiment.Feature_set_uni(1000, held_out_set, 1) held_out_sets_bi = sentiment.Feature_set_bi(cutoff, held_out_set, 1) held_out_sets_uni_bi = [] for i in range(len(held_out_sets_uni)): temp = dict(held_out_sets_uni[i][0]) temp.update(held_out_sets_bi[i][0]) held_out_sets_uni_bi.append((temp, held_out_sets_uni[i][1])) algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0] classifier_uni_bi = nltk.MaxentClassifier.train(trainsets_uni_bi, algorithm, max_iter=50) classifier_uni_bi.show_most_informative_features(10) print "\nClassifier Accuracy : %4f\n" % nltk.classify.accuracy( classifier_uni_bi, held_out_sets_uni_bi) predicted_labels_uni_bi = sentiment.Maxent_predicted( classifier_uni_bi, held_out_sets_uni_bi) sentiment.Evaluation(predicted_labels_uni_bi, real_labels)
def main(): # - categories 'neg' and 'pos' # - fileid ex - 'neg/cv000_29416.txt' documents = [(list(movie_reviews.words(fileid)), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)] random.shuffle(documents) (documents_adj, documents_adj_adv, documents_adj_adv_v) = POS(documents) cutoffs = [100, 200, 500, 1000, 2000, 3000] # adj only print "--- adj only ---" training_set_adj = documents_adj[:1600] #1600 sentences in training set held_out_set_adj = documents_adj[-400:-200] #200 sentences in held out set test_set_adj = documents_adj[-200:] # 200 sentences in test out set sentiment_adj = Sentiment() sentiment_adj.Set_word_stat(training_set_adj) sentiment_adj.Conclude_word_stat() real_labels_adj = sentiment_adj.Real_labels(held_out_set_adj) # without info gain sentiment_adj.Uni_count_list() for cutoff in cutoffs: print "cutoff = " + str(cutoff) trainsets_adj = sentiment_adj.Feature_set_uni(cutoff, training_set_adj, 1) held_out_sets_adj = sentiment_adj.Feature_set_uni(cutoff, held_out_set_adj, 1) algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0] classifier_adj = nltk.MaxentClassifier.train(trainsets_adj, algorithm, max_iter = 50) classifier_adj.show_most_informative_features(10) print "\nClassifier Accuracy : %4f\n" % nltk.classify.accuracy(classifier_adj, held_out_sets_adj) predicted_labels_adj = sentiment_adj.Maxent_predicted(classifier_adj, held_out_sets_adj) sentiment_adj.Evaluation(predicted_labels_adj, real_labels_adj) # with info gain print "--- adj only with information gain---" sentiment_adj.Uni_high_score_list() for cutoff in cutoffs: print "cutoff = " + str(cutoff) trainsets_adj = sentiment_adj.Feature_set_uni(cutoff, training_set_adj, 2) held_out_sets_adj = sentiment_adj.Feature_set_uni(cutoff, held_out_set_adj, 2) algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0] classifier_adj = nltk.MaxentClassifier.train(trainsets_adj, algorithm, max_iter = 50) classifier_adj.show_most_informative_features(10) print "\nClassifier Accuracy : %4f\n" % nltk.classify.accuracy(classifier_adj, held_out_sets_adj) predicted_labels_adj = sentiment_adj.Maxent_predicted(classifier_adj, held_out_sets_adj) sentiment_adj.Evaluation(predicted_labels_adj, real_labels_adj) # adj + adv print "--- adj + adv ---" training_set_adj_adv = documents_adj_adv[:1600] #1600 sentences in training set held_out_set_adj_adv = documents_adj_adv[-400:-200] #200 sentences in held out set test_set_adj_adv = documents_adj_adv[-200:] # 200 sentences in test set sentiment_adj_adv = Sentiment() sentiment_adj_adv.Set_word_stat(training_set_adj_adv) sentiment_adj_adv.Conclude_word_stat() real_labels_adj_adv = sentiment_adj_adv.Real_labels(held_out_set_adj_adv) # without info gain sentiment_adj_adv.Uni_count_list() for cutoff in cutoffs: print "cutoff = " + str(cutoff) trainsets_adj_adv = sentiment_adj_adv.Feature_set_uni(cutoff, training_set_adj_adv, 1) held_out_sets_adj_adv = sentiment_adj_adv.Feature_set_uni(cutoff, held_out_set_adj_adv, 1) algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0] classifier_adj_adv = nltk.MaxentClassifier.train(trainsets_adj_adv, algorithm, max_iter = 50) classifier_adj_adv.show_most_informative_features(10) print "\nClassifier Accuracy : %4f\n" % nltk.classify.accuracy(classifier_adj_adv, held_out_sets_adj_adv) predicted_labels_adj_adv = sentiment_adj_adv.Maxent_predicted(classifier_adj_adv, held_out_sets_adj_adv) sentiment_adj_adv.Evaluation(predicted_labels_adj_adv, real_labels_adj_adv) # with info gain print "--- adj + adv with 
information gain---" sentiment_adj_adv.Uni_high_score_list() for cutoff in cutoffs: print "cutoff = " + str(cutoff) trainsets_adj_adv = sentiment_adj_adv.Feature_set_uni(cutoff, training_set_adj_adv, 2) held_out_sets_adj_adv = sentiment_adj_adv.Feature_set_uni(cutoff, held_out_set_adj_adv, 2) algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0] classifier_adj_adv = nltk.MaxentClassifier.train(trainsets_adj_adv, algorithm, max_iter = 50) classifier_adj_adv.show_most_informative_features(10) print "\nClassifier Accuracy : %4f\n" % nltk.classify.accuracy(classifier_adj_adv, held_out_sets_adj_adv) predicted_labels_adj_adv = sentiment_adj_adv.Maxent_predicted(classifier_adj_adv, held_out_sets_adj_adv) sentiment_adj_adv.Evaluation(predicted_labels_adj_adv, real_labels_adj_adv) # adj + adv + v print "--- adj + adv + v---" training_set_adj_adv_v = documents_adj_adv_v[:1600] #1600 sentences in training set held_out_set_adj_adv_v = documents_adj_adv_v[-400:-200] #200 sentences in held out set test_set_adj_adv_v = documents_adj_adv_v[-200:] # 200 sentences in test set sentiment_adj_adv_v = Sentiment() sentiment_adj_adv_v.Set_word_stat(training_set_adj_adv_v) sentiment_adj_adv_v.Conclude_word_stat() real_labels_adj_adv_v = sentiment_adj_adv_v.Real_labels(held_out_set_adj_adv_v) # without info gain sentiment_adj_adv_v.Uni_count_list() for cutoff in cutoffs: print "cutoff = " + str(cutoff) trainsets_adj_adv_v = sentiment_adj_adv_v.Feature_set_uni(cutoff, training_set_adj_adv_v, 1) held_out_sets_adj_adv_v = sentiment_adj_adv_v.Feature_set_uni(cutoff, held_out_set_adj_adv_v, 1) algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0] classifier_adj_adv_v = nltk.MaxentClassifier.train(trainsets_adj_adv_v, algorithm, max_iter = 50) classifier_adj_adv_v.show_most_informative_features(10) print "\nClassifier Accuracy : %4f\n" % nltk.classify.accuracy(classifier_adj_adv_v, held_out_sets_adj_adv_v) predicted_labels_adj_adv_v = sentiment_adj_adv_v.Maxent_predicted(classifier_adj_adv_v, held_out_sets_adj_adv_v) sentiment_adj_adv_v.Evaluation(predicted_labels_adj_adv_v, real_labels_adj_adv_v) # without info gain print "--- adj + adv + v with information gain---" sentiment_adj_adv_v.Uni_high_score_list() for cutoff in cutoffs: print "cutoff = " + str(cutoff) trainsets_adj_adv_v = sentiment_adj_adv_v.Feature_set_uni(cutoff, training_set_adj_adv_v, 2) held_out_sets_adj_adv_v = sentiment_adj_adv_v.Feature_set_uni(cutoff, held_out_set_adj_adv_v, 2) algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0] classifier_adj_adv_v = nltk.MaxentClassifier.train(trainsets_adj_adv_v, algorithm, max_iter = 50) classifier_adj_adv_v.show_most_informative_features(10) print "\nClassifier Accuracy : %4f\n" % nltk.classify.accuracy(classifier_adj_adv_v, held_out_sets_adj_adv_v) predicted_labels_adj_adv_v = sentiment_adj_adv_v.Maxent_predicted(classifier_adj_adv_v, held_out_sets_adj_adv_v) sentiment_adj_adv_v.Evaluation(predicted_labels_adj_adv_v, real_labels_adj_adv_v)
def preprocess(checkpoint=True):
    """
    Reads, formats and concatenates the data frames into one.
    :param checkpoint: True to save the data frame: bool
    """
    # getting nltk dataset:
    documents = [(movie_reviews.raw(fileid), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]

    # data framing
    nltk_df = pd.DataFrame()
    for review, category in documents:
        temp = pd.DataFrame(data={'text': review, 'category': category}, index=[0])
        # DataFrame.append was removed in recent pandas; concat does the same thing here
        nltk_df = pd.concat([nltk_df, temp])
    nltk_df.reset_index(drop=True, inplace=True)
    nltk_df['category'] = nltk_df['category'].map(lambda x: 0 if x == 'neg' else 1)

    # getting tweets dataset from stanford:
    tweets_df = pd.read_csv(
        '/Mining_The_Social_Web/datasets/tweetsstanford_training.csv',
        sep=',', header=None,
        names=['category', 'id', 'date', 'query', 'user', 'text'])
    tweets_df['category'] = tweets_df['category'].map(lambda x: 1 if x == 4 else 0)

    # getting dataset from University of Michigan:
    umich_df = pd.read_csv(
        '/Mining_The_Social_Web/datasets/umich_training.txt',
        sep="\t", header=None, names=['category', 'text'])

    # getting reviews dataset from Amazon:
    amazon_df = pd.read_csv(
        '/Mining_The_Social_Web/datasets/amazon_cells_labelled.txt',
        sep="\t", header=None, names=['text', 'category'])

    # getting review dataset from IMDB
    imdb_df = pd.read_csv(
        '/Mining_The_Social_Web/datasets/imdb_labelled.txt',
        sep="\t", header=None, names=['text', 'category'])

    # getting review dataset from Yelp
    yelp_df = pd.read_csv(
        '/Mining_The_Social_Web/datasets/yelp_labelled.txt',
        sep="\t", header=None, names=['text', 'category'])

    # concatenate ALL:
    trainset_df = pd.concat([
        nltk_df, tweets_df[['category', 'text']], umich_df, yelp_df, imdb_df, amazon_df
    ])
    trainset_df.reset_index(drop=True, inplace=True)

    if checkpoint:
        trainset_df.to_csv(
            path_or_buf='/Mining_The_Social_Web/datasets/alltrainset.csv',
            header=['category', 'text'],
            columns=['category', 'text'],
            index=None, sep='\t', mode='w')
    return trainset_df
def test_movie_reviews(): """ http://www.cs.cornell.edu/people/pabo/movie-review-data/ http://www.nltk.org/book/ch06.html https://streamhacker.com/2010/05/10/text-classification-sentiment-analysis-naive-bayes-classifier/ https://streamhacker.com/2010/05/24/text-classification-sentiment-analysis-stopwords-collocations/ https://streamhacker.com/2010/06/16/text-classification-sentiment-analysis-eliminate-low-information-features/ evaluating single word features accuracy: 0.728 pos precision: 0.651595744681 pos recall: 0.98 neg precision: 0.959677419355 neg recall: 0.476 evaluating best word features accuracy: 0.93 pos precision: 0.890909090909 pos recall: 0.98 neg precision: 0.977777777778 neg recall: 0.88 Signficant Bigrams evaluating best words + bigram chi_sq word features accuracy: 0.92 pos precision: 0.913385826772 pos recall: 0.928 neg precision: 0.926829268293 neg recall: 0.912 NaiveBayesClassifier train on 1900 instances, test on 100 instances pos precision: 0.7435897435897436 pos recall: 0.5370370370370371 pos F-measure: 0.6236559139784946 neg precision: 0.5901639344262295 neg recall: 0.782608695652174 neg F-measure: 0.6728971962616822 Rules-based SentimentAnalyzer pos precision:0.6031746031746031 pos recall:0.7037037037037037 pos F-measure:0.6495726495726496 neg precision:0.5675675675675675 neg recall:0.45652173913043476 neg F-measure:0.5060240963855421 """ from nltk.corpus import movie_reviews from nltk.metrics import precision, recall, f_measure from nltk.classify import NaiveBayesClassifier import random import collections # data documents = [(list(movie_reviews.words(fileid)), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)] #random.shuffle(documents) # SET this line is only present for some debug reason. Remove it when the development is done. 
documents = documents[:200] train_docs = documents[100:] test_docs = documents[:100] # negids = movie_reviews.fileids('neg') # posids = movie_reviews.fileids('pos') # negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids] # posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids] # negcutoff = len(negfeats)*3/4 # poscutoff = len(posfeats)*3/4 # trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff] # testfeats = negfeats[negcutoff:] + posfeats[poscutoff:] # Machine Learning approach do_ML = False # SET refsets = collections.defaultdict(set) if do_ML: print ('NaiveBayesClassifier') # preprocessing print ('+ preprocessing') all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words()) word_features = list(all_words)[:2000] def document_features(document): document_words = set(document) features = {} for word in word_features: features['contains({})'.format(word)] = (word in document_words) return features train_featuresets = [(document_features(d), c) for (d,c) in train_docs] test_featuresets = [(document_features(d), c) for (d,c) in test_docs] # training print ('+ train on %d instances' % (len(train_featuresets))) classifier = nltk.NaiveBayesClassifier.train(train_featuresets) # testing print ('+ test on %d instances' % ( len(test_featuresets))) classifier_hypsets = collections.defaultdict(set) for i, (feats, label) in enumerate(test_featuresets): refsets[label].add(i) classifier_hyp = classifier.classify(feats) classifier_hypsets[classifier_hyp].add(i) print ('pos precision:', precision(refsets['pos'], classifier_hypsets['pos'])) print ('pos recall:', recall(refsets['pos'], classifier_hypsets['pos'])) print ('pos F-measure:', f_measure(refsets['pos'], classifier_hypsets['pos'])) print ('neg precision:', precision(refsets['neg'], classifier_hypsets['neg'])) print ('neg recall:', recall(refsets['neg'], classifier_hypsets['neg'])) print ('neg F-measure:', f_measure(refsets['neg'], classifier_hypsets['neg'])) # print ('Rules-based SentimentAnalyzer') sa = SentimentAnalyzer() # preprocessing print ('+ preprocessing') def pyrata_structure_as_features (doc): tokens_pos = nltk.pos_tag(doc) pyrata_tokens = [{'raw':w, 'pos':p, 'lc':w.lower()} for (w, p) in tokens_pos] return pyrata_tokens train_featuresets = [(pyrata_structure_as_features(d), c) for (d,c) in train_docs] test_featuresets = [(pyrata_structure_as_features(d), c) for (d,c) in test_docs] # print ('+ train on %d instances' % (len(train_featuresets))) for i, (doc, label) in enumerate(train_featuresets): print ('Debug: label={}'.format(label)) sa.booster_extraction(doc) # testing print ('+ test on %d instances' % (len(test_featuresets))) rules_based_hypsets = collections.defaultdict(set) for i, (doc, label) in enumerate(test_featuresets): #print ('Debug: doc={}'.format(doc)) rules_based_hyp = sa.label_polarity(doc) rules_based_hypsets[rules_based_hyp].add(i) print ('pos precision:{:10}'.format(precision(refsets['pos'], rules_based_hypsets['pos']))) print ('pos recall:{:10}'.format(recall(refsets['pos'], rules_based_hypsets['pos']))) print ('pos F-measure:{:10}'.format(f_measure(refsets['pos'], rules_based_hypsets['pos']))) print ('neg precision:{:10}'.format(precision(refsets['neg'], rules_based_hypsets['neg']))) print ('neg recall:{:10}'.format(recall(refsets['neg'], rules_based_hypsets['neg']))) print ('neg F-measure:{:10}'.format(f_measure(refsets['neg'], rules_based_hypsets['neg'])))
def main(): global documents global all_words # contains all the words in movie reviews global word_features # list containing all the word features global featuresets global training_set global testing_set documents = [(list(movie_reviews.words(fileid)), category) # [(review, category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)] all_words = [] # documents = [] # for category in movie_reviews.categories(): # for fileid in movie_reviews.fileids(category): # documents.append(list(movie_reviews.words(fileid)), category) random.shuffle(documents) # to prevent extreme bias # print(documents[1]) for w in movie_reviews.words(): all_words.append(w.lower()) # we need to convert all words to lower case all_words = nltk.FreqDist(all_words) word_features = list(all_words.keys())[:3000] # top 3000 words # print(all_words.most_common(15)) # prints out the 15 most common words # print("Number of times stupid pops up {}".format(all_words["stupid"])) # print((find_features(movie_reviews.words('neg/cv000_29416.txt'), word_features))) featuresets = [(find_features(rev, word_features), category) for (rev, category) in documents] training_set = featuresets[:1900] testing_set = featuresets[1900:] classifier = nltk.NaiveBayesClassifier.train(training_set) print("Naive Bayes Algorithm accuracy: ", (nltk.classify.accuracy(classifier, testing_set)) * 100) classifier.show_most_informative_features(15) # save_classifier = open("naivebayes.pickle", "wb") # pickle.dump(classifier, save_classifier) # save_classifier.close() classifier_f = open("naivebayes.pickle", "rb") classifier = pickle.load(classifier_f) print("Naive Bayes Algorithm accuracy: ", (nltk.classify.accuracy(classifier, testing_set)) * 100) classifier_f.close() # MultinomialNB MNB_classifier = SklearnClassifier(MultinomialNB()) MNB_classifier.train(training_set) print("MNB Classifier Algorithm accuracy: ", (nltk.classify.accuracy(MNB_classifier, testing_set)) * 100) # GaussianNB # GaussianNB_classifier = SklearnClassifier(GaussianNB()) # GaussianNB_classifier.train(training_set) # print("GaussianNB Classifier Algorithm accuracy: ", (nltk.classify.accuracy(GaussianNB_classifier, testing_set)) * 100) # BernoulliNB BernoulliNB_classifier = SklearnClassifier(BernoulliNB()) BernoulliNB_classifier.train(training_set) print("BernoulliNB Classifier Algorithm accuracy: ", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set)) * 100) # LogisticRegression, SGDClassifier # SVC, LinearSVC, NuSVC # Logistic Regression LogisticRegression_classifier = SklearnClassifier(LogisticRegression()) LogisticRegression_classifier.train(training_set) print("LogisticRegression Classifier Algorithm accuracy: ", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set)) * 100) # SGDClassifier SGDClassifier_classifier = SklearnClassifier(SGDClassifier()) SGDClassifier_classifier.train(training_set) print("SGDClassifier Classifier Algorithm accuracy: ", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set)) * 100) # SVC SVC_classifier = SklearnClassifier(SVC()) SVC_classifier.train(training_set) print("SVC Classifier Algorithm accuracy: ", (nltk.classify.accuracy(SVC_classifier, testing_set)) * 100) # LinearSVC LinearSVC_classifier = SklearnClassifier(LinearSVC()) LinearSVC_classifier.train(training_set) print("LinearSVC Classifier Algorithm accuracy: ", (nltk.classify.accuracy(LinearSVC_classifier, testing_set)) * 100) # NuSVC NuSVC_classifier = SklearnClassifier(NuSVC()) NuSVC_classifier.train(training_set) 
print("NuSVC Classifier Algorithm accuracy: ", (nltk.classify.accuracy(NuSVC_classifier, testing_set)) * 100)
def evaluate(test, category_list):
    '''
    Returns the percentage of reviews whose tags were predicted correctly.
    '''
    return 100.0 * sum(1 for x in test_model(test, category_list)
                       if x[0] == mr.categories(x[1])[0]) / len(test)
import nltk
import random
from nltk.corpus import movie_reviews

print(movie_reviews.categories())  # ['pos', 'neg']

docs = [(list(movie_reviews.words(fileid)), category)
        for category in movie_reviews.categories()
        for fileid in movie_reviews.fileids(category)]
random.shuffle(docs)

all_words = []
for w in movie_reviews.words():
    all_words.append(w)

all_words = nltk.FreqDist(all_words)
# a FreqDist is indexed like a dict, not called
print(all_words["gorgeous"])  # 50
from nltk.corpus import movie_reviews

print(movie_reviews.categories(movie_reviews.fileids()[10]))
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews

nltk.download("stopwords")


def extract_features(word_list):
    return dict([(word, True) for word in word_list])


# Create a list of movie review documents
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        # documents.append((list(movie_reviews.words(fileid)), category))
        documents.append((movie_reviews.words(fileid), category))

if __name__ == '__main__':
    # Load positive and negative reviews
    positive_fileids = movie_reviews.fileids('pos')
    negative_fileids = movie_reviews.fileids('neg')
    # print("No.of.postive fileds", positive_fileids)
    # print("No.of.Negative fields", negative_fileids)

    # Total reviews
    print("Total No.of.Reviews in Movies", len(movie_reviews.fileids()))  # Output: 2000

    # Review categories
    print("Categoriacal variables",
""" Sentiment Analysis-Movie Reviews using NLTK @author: Sathish Sampath([email protected]) Developed as part of Microsoft's NLP MOOC(https://www.edx.org/course/natural-language-processing-nlp) """ # movie reviews / sentiment analysis import nltk from nltk.corpus import movie_reviews as reviews import random # Input Documents docs = [(list(reviews.words(id)), cat) for cat in reviews.categories() for id in reviews.fileids(cat)] # Shuffle the input random.shuffle(docs) fd = nltk.FreqDist(word.lower() for word in reviews.words()) topKeys = [key for (key, value) in fd.most_common(2000)] def review_features(doc): docSet = set(doc) features = {} for word in topKeys: features[word] = (word in docSet)
import nltk
import random
from nltk.corpus import movie_reviews

documents = []
for category in movie_reviews.categories():  # category is pos or neg
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))

random.shuffle(documents)
# print(documents[1])

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)  # convert into a format that nltk can use
print(all_words.most_common(15))
print(all_words["stupid"])  # how many times this word is used in the reviews
        s = s.encode('utf8')
        if s == predictions[p]:
            correct += 1
        p = p + 1
    return (correct / float(len(test))) * 100.0


# The calling code
voc = []  # Contains vocabularies
voc = movie_reviews.words()

docs = []  # Contains all docs irrespective of category
docs = movie_reviews.fileids()

C = []
C = movie_reviews.categories()  # Contains all the categories

# splitting the data set
splitRatio_train = 0.60
splitRatio_test = 0.20
splitRatio_cross = 0.20
train, test, cross = splitDataset(docs, splitRatio_train, splitRatio_test, splitRatio_cross)

# Calling the TrainBernoulli function with the training data
V, prior, condprob = TrainBernoulli(C, train)

# Calling the ApplyBernoulliNB function
score = []  # List that stores the predicted class labels of the test data
score = ApplyBernoulliNB(C, V, prior, condprob, test)
""" ################ MOVIE REVIEWS - SENTIMENT ANALYSIS ############## ################ IMPORT DATA AND EXPLORE ######################### # Importing nltk & random package import nltk import random # Imporing the dataset & stopwords corpus from nltk.corpus import stopwords from nltk.corpus import movie_reviews # Creating documents list which stores file name and its category as pos or neg documents = [(list(movie_reviews.words(fileid)), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)] # we randomly shuffle the documents before creating training and testing datasets random.shuffle(documents) # randomly chosing 40th document to see its content print(documents[40]) # Listing categories movie_reviews.categories() # Listing unique file ids movie_reviews.fileids() # Finding out number of categories
nltk.download('words')

from nltk.corpus import wordnet as wn
from nltk.corpus import movie_reviews
from nltk.corpus import sentiwordnet as wdn
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

pp = pprint.PrettyPrinter(indent=4)

neg, pos = movie_reviews.categories()

new_phrases = []
for ids in movie_reviews.fileids(neg):
    for phrase in movie_reviews.sents(ids)[1:]:
        if len(phrase) > 3:
            new_phrases.append({
                'type': 'neg',
                'phrase': ' '.join(phrase).lower(),
                'pos_score': 0.0,
                'neg_score': 0.0,
                'over_score': 0.0
            })

for ids in movie_reviews.fileids(pos):
    for phrase in movie_reviews.sents(ids):
        if len(phrase) > 3:
import random
import numpy
from gensim.models import KeyedVectors
from nltk.corpus import movie_reviews

# Load pre-trained 300-dimensional word embeddings (plain-text word2vec format)
w2v = KeyedVectors.load_word2vec_format("wiki.en.5k.vec", binary=False)
# print("Number of words: %d" % len(w2v.vocab))

def we_represent(tokens):
    """Represent a document as the sum of its tokens' word embeddings."""
    vec = numpy.zeros(300)
    for tok in tokens:
        if tok.lower() in w2v:
            vec += w2v[tok.lower()]  # look up the same lower-cased key that was tested
    return vec

training_instances = []
training_labels = []
test_instances = []
test_labels = []

# Roughly 10% of the reviews go to the test set, the rest to training
for label in movie_reviews.categories():
    for fileid in movie_reviews.fileids(label):
        doc = movie_reviews.words(fileid)
        instance = we_represent(doc)
        lbl = 1 if label == 'pos' else 0
        if random.randint(0, 9) == 0:
            test_instances.append(instance)
            test_labels.append(lbl)
        else:
            training_instances.append(instance)
            training_labels.append(lbl)

print(training_instances)
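# The snippet stops after building the feature vectors. A minimal sketch of one way to
# use them, assuming scikit-learn is available (the classifier choice is an illustration,
# not part of the original code):
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=1000)
clf.fit(numpy.array(training_instances), training_labels)
print("Test accuracy: %.3f" % clf.score(numpy.array(test_instances), test_labels))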
def setup_reviews():
    # Append a (feature dict, category) pair for every review to the module-level documents list
    global documents
    documents += [(find_features(list(movie_reviews.words(fileid))), category)
                  for category in movie_reviews.categories()
                  for fileid in movie_reviews.fileids(category)]
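# find_features is used above but not defined in this fragment. A sketch of what such a
# helper commonly looks like in NLTK examples (an assumption; word_features would be a
# list of frequent corpus words chosen elsewhere in the same script):
def find_features(document):
    words = set(document)
    return {w: (w in words) for w in word_features}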
def main():
    DOC_SIZE = 1000
    TRAIN_SIZE = int(DOC_SIZE * 0.75)

    pos_files = movie_reviews.fileids(categories='pos')[:DOC_SIZE]
    neg_files = movie_reviews.fileids(categories='neg')[:DOC_SIZE]

    train_pos_files = pos_files[:TRAIN_SIZE]
    train_neg_files = neg_files[:TRAIN_SIZE]
    test_pos_files = pos_files[TRAIN_SIZE:]
    test_neg_files = neg_files[TRAIN_SIZE:]

    print('Corpus Size: {}'.format(len(pos_files + neg_files)))
    print('Training Size:\n\tpositive: {}\tnegative: {}'.format(len(train_pos_files), len(train_neg_files)))
    print('Testing Size:\n\tpositive: {}\tnegative: {}'.format(len(test_pos_files), len(test_neg_files)))
    print()

    # training datasets
    datasets = create_bunch(train_pos_files + train_neg_files)
    text_train, y_train = datasets.data, datasets.target

    # testing datasets
    datasets = create_bunch(test_pos_files + test_neg_files)
    text_test, y_test = datasets.data, datasets.target

    # vectorize training and testing data sets with bigram counts
    vectorizer = CountVectorizer(min_df=5, ngram_range=(2, 2))
    x_train = vectorizer.fit(text_train).transform(text_train)
    x_test = vectorizer.transform(text_test)

    # vocabulary
    features = vectorizer.get_feature_names()
    # print(features)

    # grid-search the regularization strength with 5-fold cross-validation
    param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
    grid = GridSearchCV(LogisticRegression(solver='lbfgs', max_iter=200), param_grid, cv=5)
    grid.fit(x_train, y_train)

    lr = grid.best_estimator_
    lr.fit(x_train, y_train)
    lr.predict(x_test)
    print("Accuracy score: {:.2f}".format(lr.score(x_test, y_test)))
    print()
    print()

    # predictions
    print('Predicting movie reviews using Logistic Regression classifier:')
    print('prediction:')
    print('[0] => negative\n[1] => positive')
    print()

    test_datasets = test_pos_files + test_neg_files
    random.shuffle(test_datasets)
    for i in range(10):
        r = random.randint(0, len(test_datasets) - 1)
        f = test_datasets[r]
        actual = movie_reviews.categories(f)
        raw = [movie_reviews.raw(f)]
        predict = lr.predict(vectorizer.transform(raw))
        print('Test doc: {}\t\tactual class: {}\t\tprediction: {}'.format(r, actual, predict))
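# create_bunch is called in main() but not shown in this fragment. A sketch of a plausible
# implementation using sklearn.utils.Bunch (an assumption about the original helper):
# raw review text as data, 1 for 'pos' and 0 for 'neg' as target.
from sklearn.utils import Bunch

def create_bunch(fileids):
    data = [movie_reviews.raw(f) for f in fileids]
    target = [1 if movie_reviews.categories(f)[0] == 'pos' else 0 for f in fileids]
    return Bunch(data=data, target=target)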
def getDocuments(self):
    categories = movie_reviews.categories()
    documents = [(fileid, category)
                 for category in categories
                 for fileid in movie_reviews.fileids(category)]
    return documents
# predict = classifier.classify(gender_features(name))
# if predict != tag:
#     errors.append((name, tag, predict))
# errors.sort()
# for name, tag, predict in errors[:10]:
#     print("name=%-30s " % name + "tag=%-8s" % tag + "predict=%-8s" % predict)

# ###########################################################################
# Building a document classification model
# ###########################################################################
# Build the documents: use the movie_reviews corpus and label each document as positive or negative
import nltk
from nltk.corpus import movie_reviews
import random

document = [(list(movie_reviews.words(fileid)), tag)
            for tag in movie_reviews.categories()
            for fileid in movie_reviews.fileids(tag)]
random.shuffle(document)
# print(document[:10])

# Build the features. How? The simplest approach is to record whether a word occurs in the
# document, i.e. a one-hot / bag-of-words presence model.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words.keys())[:2000]  # note: keys() is insertion-ordered, not frequency-ordered

def get_feature_doc(document):
    words = set(document)
    features = {}
    for word in word_features:
        features["contains{0}".format(word)] = word in words
    return features
Python 3.8.2 (tags/v3.8.2:7b3ab59, Feb 25 2020, 23:03:10) [MSC v.1916 64 bit (AMD64)] on win32 Type "help", "copyright", "credits" or "license()" for more information. >>> from nltk.corpus import movie_reviews >>> print (len(movie_reviews.fileids())) #total reviews 2000 >>> print (movie_reviews.categories()) #review categories ['neg', 'pos'] >>> print (len(movie_reviews.fileids('pos'))) #pos reviews 1000 >>> print (len(movie_reviews.fileids('neg'))) #neg reviews 1000 >>> positive_review_file = movie_reviews.fileids('pos')[0] >>> print (positive_review_file) pos/cv000_29590.txt >>> documents = [] #creating a movie review document >>> for category in movie_reviews.categories(): for fileid in movie_reviews.fileids(category): documents.append((movie_reviews.words(fileid), category)) >>> print (len(documents)) 2000 >>> x = [str(item) for item in documents[0][0]] >>> print(x) ['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', '.', 'one', 'of', 'the', 'guys', 'dies', ',', 'but', 'his', 'girlfriend', 'continues', 'to', 'see', 'him', 'in', 'her', 'life', ',', 'and', 'has', 'nightmares', '.', 'what', "'", 's', 'the', 'deal', '?', 'watch', 'the', 'movie', 'and', '"', 'sorta', '"', 'find', 'out', '.', '.', '.', 'critique', ':', 'a', 'mind', '-', 'f**k', 'movie', 'for', 'the', 'teen', 'generation', 'that', 'touches', 'on', 'a', 'very', 'cool', 'idea', ',', 'but', 'presents', 'it', 'in', 'a', 'very', 'bad', 'package', '.', 'which', 'is', 'what', 'makes', 'this', 'review', 'an', 'even', 'harder', 'one', 'to', 'write', ',', 'since', 'i', 'generally', 'applaud', 'films', 'which', 'attempt', 'to', 'break', 'the', 'mold', ',', 'mess', 'with', 'your', 'head', 'and', 'such', '(', 'lost', 'highway', '&', 'memento', ')', ',', 'but', 'there', 'are', 'good', 'and', 'bad', 'ways', 'of', 'making', 'all', 'types', 'of', 'films', ',', 'and', 'these', 'folks', 'just', 'didn', "'", 't', 'snag', 'this', 'one', 'correctly', '.', 'they', 'seem', 'to', 'have', 'taken', 'this', 'pretty', 'neat', 'concept', ',', 'but', 'executed', 'it', 'terribly', '.', 'so', 'what', 'are', 'the', 'problems', 'with', 'the', 'movie', '?', 'well', ',', 'its', 'main', 'problem', 'is', 'that', 'it', "'", 's', 'simply', 'too', 'jumbled', '.', 'it', 'starts', 'off', '"', 'normal', '"', 'but', 'then', 'downshifts', 'into', 'this', '"', 'fantasy', '"', 'world', 'in', 'which', 'you', ',', 'as', 'an', 'audience', 'member', ',', 'have', 'no', 'idea', 'what', "'", 's', 'going', 'on', '.', 'there', 'are', 'dreams', ',', 'there', 'are', 'characters', 'coming', 'back', 'from', 'the', 'dead', ',', 'there', 'are', 'others', 'who', 'look', 'like', 'the', 'dead', ',', 'there', 'are', 'strange', 'apparitions', ',', 'there', 'are', 'disappearances', ',', 'there', 'are', 'a', 'looooot', 'of', 'chase', 'scenes', ',', 'there', 'are', 'tons', 'of', 'weird', 'things', 'that', 'happen', ',', 'and', 'most', 'of', 'it', 'is', 'simply', 'not', 'explained', '.', 'now', 'i', 'personally', 'don', "'", 't', 'mind', 'trying', 'to', 'unravel', 'a', 'film', 'every', 'now', 'and', 'then', ',', 'but', 'when', 'all', 'it', 'does', 'is', 'give', 'me', 'the', 'same', 'clue', 'over', 'and', 'over', 'again', ',', 'i', 'get', 'kind', 'of', 'fed', 'up', 'after', 'a', 'while', ',', 'which', 'is', 'this', 'film', "'", 's', 'biggest', 'problem', '.', 'it', "'", 's', 'obviously', 'got', 'this', 'big', 'secret', 'to', 'hide', ',', 'but', 'it', 'seems', 'to', 'want', 'to', 'hide', 'it', 
'completely', 'until', 'its', 'final', 'five', 'minutes', '.', 'and', 'do', 'they', 'make', 'things', 'entertaining', ',', 'thrilling', 'or', 'even', 'engaging', ',', 'in', 'the', 'meantime', '?', 'not', 'really', '.', 'the', 'sad', 'part', 'is', 'that', 'the', 'arrow', 'and', 'i', 'both', 'dig', 'on', 'flicks', 'like', 'this', ',', 'so', 'we', 'actually', 'figured', 'most', 'of', 'it', 'out', 'by', 'the', 'half', '-', 'way', 'point', ',', 'so', 'all', 'of', 'the', 'strangeness', 'after', 'that', 'did', 'start', 'to', 'make', 'a', 'little', 'bit', 'of', 'sense', ',', 'but', 'it', 'still', 'didn', "'", 't', 'the', 'make', 'the', 'film', 'all', 'that', 'more', 'entertaining', '.', 'i', 'guess', 'the', 'bottom', 'line', 'with', 'movies', 'like', 'this', 'is', 'that', 'you', 'should', 'always', 'make', 'sure', 'that', 'the', 'audience', 'is', '"', 'into', 'it', '"', 'even', 'before', 'they', 'are', 'given', 'the', 'secret', 'password', 'to', 'enter', 'your', 'world', 'of', 'understanding', '.', 'i', 'mean', ',', 'showing', 'melissa', 'sagemiller', 'running', 'away', 'from', 'visions', 'for', 'about', '20', 'minutes', 'throughout', 'the', 'movie', 'is', 'just', 'plain', 'lazy', '!', '!', 'okay', ',', 'we', 'get', 'it', '.', '.', '.', 'there', 'are', 'people', 'chasing', 'her', 'and', 'we', 'don', "'", 't', 'know', 'who', 'they', 'are', '.', 'do', 'we', 'really', 'need', 'to', 'see', 'it', 'over', 'and', 'over', 'again', '?', 'how', 'about', 'giving', 'us', 'different', 'scenes', 'offering', 'further', 'insight', 'into', 'all', 'of', 'the', 'strangeness', 'going', 'down', 'in', 'the', 'movie', '?', 'apparently', ',', 'the', 'studio', 'took', 'this', 'film', 'away', 'from', 'its', 'director', 'and', 'chopped', 'it', 'up', 'themselves', ',', 'and', 'it', 'shows', '.', 'there', 'might', "'", 've', 'been', 'a', 'pretty', 'decent', 'teen', 'mind', '-', 'f**k', 'movie', 'in', 'here', 'somewhere', ',', 'but', 'i', 'guess', '"', 'the', 'suits', '"', 'decided', 'that', 'turning', 'it', 'into', 'a', 'music', 'video', 'with', 'little', 'edge', ',', 'would', 'make', 'more', 'sense', '.', 'the', 'actors', 'are', 'pretty', 'good', 'for', 'the', 'most', 'part', ',', 'although', 'wes', 'bentley', 'just', 'seemed', 'to', 'be', 'playing', 'the', 'exact', 'same', 'character', 'that', 'he', 'did', 'in', 'american', 'beauty', ',', 'only', 'in', 'a', 'new', 'neighborhood', '.', 'but', 'my', 'biggest', 'kudos', 'go', 'out', 'to', 'sagemiller', ',', 'who', 'holds', 'her', 'own', 'throughout', 'the', 'entire', 'film', ',', 'and', 'actually', 'has', 'you', 'feeling', 'her', 'character', "'", 's', 'unraveling', '.', 'overall', ',', 'the', 'film', 'doesn', "'", 't', 'stick', 'because', 'it', 'doesn', "'", 't', 'entertain', ',', 'it', "'", 's', 'confusing', ',', 'it', 'rarely', 'excites', 'and', 'it', 'feels', 'pretty', 'redundant', 'for', 'most', 'of', 'its', 'runtime', ',', 'despite', 'a', 'pretty', 'cool', 'ending', 'and', 'explanation', 'to', 'all', 'of', 'the', 'craziness', 'that', 'came', 'before', 'it', '.', 'oh', ',', 'and', 'by', 'the', 'way', ',', 'this', 'is', 'not', 'a', 'horror', 'or', 'teen', 'slasher', 'flick', '.', '.', '.', 'it', "'", 's', 'just', 'packaged', 'to', 'look', 'that', 'way', 'because', 'someone', 'is', 'apparently', 'assuming', 'that', 'the', 'genre', 'is', 'still', 'hot', 'with', 'the', 'kids', '.', 'it', 'also', 'wrapped', 'production', 'two', 'years', 'ago', 'and', 'has', 'been', 'sitting', 'on', 'the', 'shelves', 'ever', 'since', '.', 'whatever', '.', '.', '.', 'skip', 'it', '!', 'where', 
"'", 's', 'joblo', 'coming', 'from', '?', 'a', 'nightmare', 'of', 'elm', 'street', '3', '(', '7', '/', '10', ')', '-', 'blair', 'witch', '2', '(', '7', '/', '10', ')', '-', 'the', 'crow', '(', '9', '/', '10', ')', '-', 'the', 'crow', ':', 'salvation', '(', '4', '/', '10', ')', '-', 'lost', 'highway', '(', '10', '/', '10', ')', '-', 'memento', '(', '10', '/', '10', ')', '-', 'the', 'others', '(', '9', '/', '10', ')', '-', 'stir', 'of', 'echoes', '(', '8', '/', '10', ')'] >>> print (documents[0]) (['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg') >>> from random import shuffle >>> shuffle(documents) #shuffle the document list >>>