def __init__(self, pos=None, neg=None):
    """Set up the review texts and obtain a trained classifier.

    Args:
        pos: Positive review texts. A falsy value (``None`` or an empty
            sequence) is replaced by the raw text of every 'pos' file in
            ``movie_reviews``.
        neg: Negative review texts, with the same fallback for 'neg'.

    The classifier is unpickled from ``classifier.pickle`` in the current
    working directory when that file exists. Otherwise a Naive Bayes
    classifier is trained on ``self.__train_data()`` and pickled to that
    path so later runs can skip training.
    """
    self.__pos = pos or [
        movie_reviews.raw(fileid) for fileid in movie_reviews.fileids('pos')
    ]
    self.__neg = neg or [
        movie_reviews.raw(fileid) for fileid in movie_reviews.fileids('neg')
    ]

    cache_path = 'classifier.pickle'
    if not os.path.isfile(cache_path):
        # No cache yet: train, then store the result for faster start-up.
        self.__classifier = nltk.NaiveBayesClassifier.train(
            self.__train_data())
        with open(cache_path, 'wb') as cache_file:
            pickle.dump(self.__classifier, cache_file)
        return

    # NOTE(review): unpickling executes code from the file; only safe while
    # classifier.pickle is written by this program.
    with open(cache_path, 'rb') as cache_file:
        self.__classifier = pickle.load(cache_file)
def get_sentilyzer():
    """Return the sentiment classifier, training it when no cache loads.

    The classifier is first read from ``data/sentilyzer.pickle``. If that
    fails for any reason, a Naive Bayes classifier is trained on the first
    three quarters of the negative and of the positive movie reviews,
    its accuracy on the remaining quarter is printed, and the classifier is
    pickled back to the same path.

    Returns:
        The loaded or freshly trained classifier.

    Raises:
        IOError/OSError: if the freshly trained classifier cannot be
            written to ``data/sentilyzer.pickle``.
    """
    from nltk.corpus import movie_reviews

    cache_path = "data/sentilyzer.pickle"
    try:
        # Pickle data is binary; text mode corrupts or rejects it on
        # Python 3 and on Windows. The ``with`` block closes the handle.
        # NOTE(review): unpickling executes code from the file; the cache
        # must only ever be written by this program.
        with open(cache_path, "rb") as cache_file:
            return pickle.load(cache_file)
    except Exception:
        # Deliberately broad (missing file, truncated or stale pickle all
        # mean "retrain"), but no longer swallows KeyboardInterrupt.
        print("Unable to load sentilyzer, so training it again")

    negfeats = [(extract_features(movie_reviews.raw(fileids=[f])), 'neg')
                for f in movie_reviews.fileids('neg')]
    posfeats = [(extract_features(movie_reviews.raw(fileids=[f])), 'pos')
                for f in movie_reviews.fileids('pos')]
    print("Length of Negative Features %d" % len(negfeats))
    print("Length of Positive Features %d" % len(posfeats))

    # Integer arithmetic keeps the cutoffs usable as slice bounds.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    print('train on %d instances, test on %d instances'
          % (len(trainfeats), len(testfeats)))

    classifier = nltk.NaiveBayesClassifier.train(trainfeats)
    print('accuracy of sentiment analysis: %s'
          % nltk.classify.util.accuracy(classifier, testfeats))

    with open(cache_path, "wb") as cache_file:
        pickle.dump(classifier, cache_file)
    return classifier
def load_reviews():
    """Load the NLTK movie reviews and split them into train, dev and test.

    Positive reviews get the label 1 and negative reviews the label -1.
    The documents are shuffled by a generator seeded with 0, so the split
    is the same on every run, and are then cut 60% / 20% / 20%.

    Returns:
        ``(training_collection, dev_collection, test_collection)``, each
        built with ``DocumentCollection.from_document_list``.
    """
    labelled_ids = [(fileid, 1) for fileid in movie_reviews.fileids('pos')]
    labelled_ids += [(fileid, -1) for fileid in movie_reviews.fileids('neg')]
    reviews = [
        TextDocument(movie_reviews.raw(fileids=[fileid]), fileid, label)
        for fileid, label in labelled_ids
    ]

    # A dedicated, seeded generator makes the shuffle reproducible without
    # touching the global random state.
    random.Random(0).shuffle(reviews)

    total = len(reviews)
    start_dev = int(0.6 * total)   # index where the development part begins
    start_test = int(0.8 * total)  # index where the test part begins

    parts = (
        reviews[:start_dev],            # first 60%: training
        reviews[start_dev:start_test],  # next 20%: hyper-parameter tuning
        reviews[start_test:],           # last 20%: testing
    )
    training_collection, dev_collection, test_collection = [
        DocumentCollection.from_document_list(part) for part in parts
    ]
    return training_collection, dev_collection, test_collection
def nltk_data(n_texts_train=1500, n_texts_dev=500, vocab_size=10000):
    """Read texts from the nltk movie_reviews corpus as lists of word ids.

    The file ids are shuffled, each selected review is tokenized and
    lower-cased, a word2id dictionary is created from the training texts,
    and the words of both splits are replaced by their ids.

    :param n_texts_train: the number of reviews that will form the training data
    :param n_texts_dev: the number of reviews that will form the development data
    :param vocab_size: the maximum size of the vocabulary.
    :return list texts_train: A list containing lists of wordids corresponding to training texts.
    :return list texts_dev: A list containing lists of wordids corresponding to development texts.
    :return labels_train: A list containing the labels (0 or 1) for the corresponding text entry in texts_train
    :return labels_dev: A list containing the labels (0 or 1) for the corresponding text entry in texts_dev
    :return word2id: The dictionary obtained from the training texts that maps each seen word to an id.
    """
    # Copy before shuffling so the in-place shuffle cannot reorder a list
    # that the corpus reader may hand out again to other callers.
    all_ids = list(movie_reviews.fileids())
    if n_texts_train + n_texts_dev > len(all_ids):
        print(
            "Error: There are only", len(all_ids),
            "texts in the movie_reviews corpus. Training with all of those sentences."
        )
        # Use everything that exists, split 75% / 25%. For the 2000-file
        # corpus this is the former hard-coded 1500 / 500; for a smaller
        # corpus it no longer indexes past the end of the id list.
        n_texts_train = len(all_ids) * 3 // 4
        n_texts_dev = len(all_ids) - n_texts_train

    # A set makes the per-document label lookup O(1) instead of a list scan.
    positive_ids = set(movie_reviews.fileids('pos'))
    random.shuffle(all_ids)

    def tokens_and_label(fileid):
        """Return (lower-cased tokens, 1 for a positive review else 0)."""
        text = movie_reviews.raw(fileids=[fileid])
        tokens = [word.lower() for word in word_tokenize(text)]
        return tokens, 1 if fileid in positive_ids else 0

    texts_train, labels_train = [], []
    for fileid in all_ids[:n_texts_train]:
        tokens, label = tokens_and_label(fileid)
        texts_train.append(tokens)
        labels_train.append(label)

    texts_dev, labels_dev = [], []
    for fileid in all_ids[n_texts_train:n_texts_train + n_texts_dev]:
        tokens, label = tokens_and_label(fileid)
        texts_dev.append(tokens)
        labels_dev.append(label)

    # The vocabulary comes from the training texts only.
    word2id = create_dictionary(texts_train, vocab_size)
    texts_train = [to_ids(s, word2id) for s in texts_train]
    texts_dev = [to_ids(s, word2id) for s in texts_dev]
    return (texts_train, labels_train, texts_dev, labels_dev, word2id)
def to_arr():
    """Build a word-count matrix and binary labels for the movie reviews.

    Negative reviews come first (label 0) and positive reviews follow
    (label 1). The raw texts are passed through ``stemmer`` before a
    default ``CountVectorizer`` counts them.

    Returns:
        ``(word_arr, bin_arr)``: the dense count matrix and the label array.
    """
    negatives, positives = [
        [movie_reviews.raw(fileids=fileid)
         for fileid in movie_reviews.fileids(tag)]
        for tag in ('neg', 'pos')
    ]
    stemmed_texts = stemmer(negatives + positives)
    counts = CountVectorizer().fit_transform(stemmed_texts)
    labels = np.array([0] * len(negatives) + [1] * len(positives))
    return (counts.toarray(), labels)
def f_i_and_c():
    """Return the feature matrix and class vector for the movie reviews.

    The texts of all negative reviews followed by all positive reviews are
    run through ``stem_list_of_text`` and counted with a default
    ``CountVectorizer``.

    Returns:
        ``(f_i, c)``: the dense count matrix, and an array holding 0 for
        every negative review followed by 1 for every positive review.
    """
    texts = []
    class_sizes = []
    for tag in ('neg', 'pos'):
        tagged = [movie_reviews.raw(fileids=fileid)
                  for fileid in movie_reviews.fileids(tag)]
        texts.extend(tagged)
        class_sizes.append(len(tagged))

    vectorizer = CountVectorizer()
    f_i = vectorizer.fit_transform(stem_list_of_text(texts)).toarray()
    n_neg, n_pos = class_sizes
    c = np.array([0] * n_neg + [1] * n_pos)
    return (f_i, c)
def opinion_features(fileid):
    """Compute count-based sentiment features for one movie review.

    Args:
        fileid: A ``movie_reviews`` file id.

    Returns:
        A dict with the keys 'positive' and 'negative' (tokens found in
        ``pos_set`` / ``neg_set``), 'exclaimation mark' (number of '!'
        characters), 'average word length', 'number of words', and
        'subjectivity' (sum of the TextBlob subjectivity of every sentence).
    """
    positive_count = 0
    negative_count = 0
    exclaim_count = 0
    char_total = 0
    word_total = 0
    # One pass over the tokens gathers every word-level statistic; the
    # corpus file used to be re-read three times.
    for word in movie_reviews.words(fileid):
        if word in pos_set:
            positive_count += 1
        if word in neg_set:
            negative_count += 1
        exclaim_count += word.count('!')
        char_total += len(word)
        word_total += 1

    # An empty review would otherwise raise ZeroDivisionError.
    avgwordlen = char_total / word_total if word_total else 0

    # movie_reviews.raw(fileid) is the whole review text.
    blob = textblob.TextBlob(movie_reviews.raw(fileid))
    subjectivity = sum(
        sentence.sentiment.subjectivity for sentence in blob.sentences)

    features = {}
    features['positive'] = positive_count
    features['negative'] = negative_count
    features['exclaimation mark'] = exclaim_count
    features['average word length'] = avgwordlen
    features['number of words'] = word_total
    features['subjectivity'] = subjectivity
    return features
def load_movie_documents():
    """Return one ``(category, [token, ...])`` pair per movie review.

    The category is "pos" when the file id starts with "pos/" and "neg"
    otherwise. The corpus files are already sentence-split and tokenized
    on disk, so splitting the raw text on whitespace yields the tokens;
    unprocessed text would instead need nltk.sent_tokenize followed by
    nltk.word_tokenize.
    """
    def labelled_tokens(fileid):
        label = "pos" if fileid.startswith("pos/") else "neg"
        return (label, movie_reviews.raw(fileid).split())

    return [labelled_tokens(fileid)
            for fileid in nltk.corpus.movie_reviews.fileids()]
def test_classify_movies(self):
    """Classify movie reviews from nltk corpus.

    Wraps every review of every category in a ``Document`` labelled with
    that category and prints the resulting list.
    """
    # NOTE(review): nothing is asserted; this only fails if building the
    # documents raises.
    from nltk.corpus import movie_reviews
    docs = [Document(raw_text=movie_reviews.raw(fileid), class_label=category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]
    # Parenthesised so the line parses on Python 3; with a single argument
    # the output on Python 2 is unchanged.
    print(docs)
def get_movie_reviews(cls):
    """Return the shuffled movie reviews as two parallel tuples.

    Returns:
        ``(labels, texts)``: the category of each review and its raw text,
        in the same randomly shuffled order.
    """
    pairs = []
    for label in movie_reviews.categories():
        for fileid in movie_reviews.fileids(label):
            pairs.append((label, movie_reviews.raw(fileid)))
    random.shuffle(pairs)
    # zip(*pairs) turns the list of pairs into one tuple per column.
    labels, texts = zip(*pairs)
    return labels, texts
def read_reviews():
    """Return the raw text of every file in the movie_reviews corpus."""
    return [movie_reviews.raw(file_id) for file_id in movie_reviews.fileids()]
def main():
    """Print the ten highest-idf terms of each movie review category.

    For each category the vectorizer is refitted on that category's raw
    reviews, and ``category term idf`` is printed for the ten terms with
    the largest idf weight.
    """
    # Many more options exist (norm, smooth_idf, sublinear_tf, binary,
    # min_df, max_df, strip_accents, ngram_range); norm, min_df and max_df
    # were noted as the useful ones to experiment with.
    options = dict(
        use_idf=True,
        max_features=None,
        preprocessor=None,
        stop_words='english',
        tokenizer=None,
        vocabulary=None,
    )
    vectorizer = TfidfVectorizer(**options)

    # Only two categories exist: 'pos' and 'neg'.
    for label in movie_reviews.categories():
        texts = [movie_reviews.raw(fileid)
                 for fileid in movie_reviews.fileids(label)]
        vectorizer.fit_transform(texts)
        weights = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
        # Terms ordered by decreasing idf weight.
        ranked_terms = sorted(weights, key=weights.get, reverse=True)
        for term in ranked_terms[:10]:
            print(label, term, weights[term])
def prep_reviews_data(self):
    """Prepare train/test vectors from the NLTK movie reviews.

    Does nothing when ``self.movie_review_data`` is already truthy.
    Otherwise the reviews are labelled 1 ('pos') or -1 (any other
    category), shuffled together with ``double_shuffle``, divided with
    ``divide_list_by_ratio`` and vectorized with ``self.vectorizer``
    (fitted on the training part only). The results are stored in
    ``train_vecs``, ``test_vecs``, ``train_labs`` and ``test_labs``, and
    the ``movie_review_data`` / ``news_market_data`` flags are updated.
    """
    if self.movie_review_data:
        return

    # Parenthesised so the line parses on Python 3; with a single argument
    # the output on Python 2 is unchanged.
    print('Preparing movie reviews...\n')
    from nltk.corpus import movie_reviews

    # One traversal fills both lists, so texts and labels cannot drift
    # apart and the corpus index is only walked once.
    docs = []
    labels = []
    for category in movie_reviews.categories():
        polarity = 1 if category == 'pos' else -1
        for fileid in movie_reviews.fileids(category):
            docs.append(movie_reviews.raw(fileid))
            labels.append(polarity)

    docs, labels = double_shuffle(docs, labels)
    training, testing = divide_list_by_ratio(docs)
    self.train_labs, self.test_labs = divide_list_by_ratio(labels)

    train_vecs = self.vectorizer.fit_transform(training)
    test_vecs = self.vectorizer.transform(testing)
    if isinstance(self.model, naive_bayes.GaussianNB):
        # The vectorizer output is converted to dense arrays for GaussianNB.
        train_vecs = train_vecs.toarray()
        test_vecs = test_vecs.toarray()

    self.train_vecs = train_vecs
    self.test_vecs = test_vecs
    self.movie_review_data = True
    self.news_market_data = False
def get_documents():
    """
    Get documents from 20 News Groups, Movie Reviews and Reuters corpora.

    Five documents are taken from each source: the first five of the
    shuffled 20 Newsgroups data (headers, footers and quotes removed), and
    five randomly chosen raw texts from each of the two NLTK corpora.

    Returns:
        list of str: Small subset of documents from News Groups, Movie Reviews and Reuters corpora
    """
    dataset = fetch_20newsgroups(subset='all', shuffle=True,
                                 remove=('headers', 'footers', 'quotes'))
    corpus_20newsgroups = dataset.data[:5]

    # Only the raw text is returned, so the (text, category) tuples that
    # were built and immediately unpacked are no longer created.
    corpus_movies = [movie_reviews.raw(fileid)
                     for category in movie_reviews.categories()
                     for fileid in movie_reviews.fileids(category)]
    shuffle(corpus_movies)
    corpus_movies = corpus_movies[:5]

    # Likewise the reuters.categories() lookup per file was discarded
    # unused, so it is skipped.
    corpus_reuters = [reuters.raw(fileid) for fileid in reuters.fileids()]
    shuffle(corpus_reuters)
    corpus_reuters = corpus_reuters[:5]

    corpus = list(corpus_20newsgroups)
    corpus.extend(corpus_movies)
    corpus.extend(corpus_reuters)
    return corpus
def read_reviews():
    """Return ``(words, category, raw_text)`` for every movie review.

    Reviews are listed category by category, in the order given by
    ``movie_reviews.categories()``.
    """
    return [
        (movie_reviews.words(fileid), category, movie_reviews.raw(fileid))
        for category in movie_reviews.categories()
        for fileid in movie_reviews.fileids(category)
    ]
def __init__(self, pos=None, neg=None):
    """Store the positive and negative review texts.

    Args:
        pos: Positive review texts. A falsy value (``None`` or an empty
            sequence) is replaced by the raw text of every 'pos' file in
            ``movie_reviews``.
        neg: Negative review texts, with the same fallback for 'neg'.
    """
    self.__pos = pos or [
        movie_reviews.raw(fileid) for fileid in movie_reviews.fileids('pos')
    ]
    self.__neg = neg or [
        movie_reviews.raw(fileid) for fileid in movie_reviews.fileids('neg')
    ]
def get_labeled_dataset():
    """Return every movie review as ``(raw_text, label)`` in random order."""
    dataset = [
        (movie_reviews.raw(fileid), label)
        for label in movie_reviews.categories()
        for fileid in movie_reviews.fileids(label)
    ]
    random.shuffle(dataset)
    return dataset
def opinion_features(fileid):
    """Compute negation-aware sentiment features for one movie review.

    A word from ``pos_set`` counts as positive and a word from ``neg_set``
    as negative, unless it is negated, in which case the polarity flips.
    A word is negated when the word directly before it is one of 'not',
    'less', 'few', "isn't", "hasn't", "wasn't", or when the word two
    positions before it is 'not'.

    Args:
        fileid: A ``movie_reviews`` file id.

    Returns:
        A dict with the keys 'positive', 'negative', 'total words',
        'total sentence', 'sentimence' (TextBlob subjectivity of the whole
        review) and 'polarity' (TextBlob polarity of the whole review).
    """
    # movie_reviews.raw(fileid) is the whole review; one TextBlob serves
    # both the word scan and the sentiment scores.
    blob = textblob.TextBlob(movie_reviews.raw(fileid))
    words = blob.words
    negators = ('not', 'less', 'few', "isn't", "hasn't", "wasn't")

    positive_count = 0
    negative_count = 0
    for i, word in enumerate(words):
        is_positive = word in pos_set
        if not is_positive and word not in neg_set:
            continue
        # Only look at words that really precede position i. Indexing with
        # i - 1 or i - 2 at the start of the review would wrap around and
        # read the LAST words of the review instead.
        previous = words[i - 1] if i >= 1 else None
        before_previous = words[i - 2] if i >= 2 else None
        negated = previous in negators or before_previous == 'not'
        # A negated positive word counts as negative, and vice versa.
        if is_positive != negated:
            positive_count += 1
        else:
            negative_count += 1

    features = {}
    features['positive'] = positive_count
    features['negative'] = negative_count
    features['total words'] = len(words)
    features['total sentence'] = len(blob.sentences)
    # The key spelling is kept: it is part of the emitted feature names.
    features['sentimence'] = blob.sentiment.subjectivity
    features['polarity'] = blob.sentiment.polarity
    return features
def read_reviews():
    """Return the raw text and the first category of every movie review.

    Returns:
        ``(documents, labels)``: two parallel lists in corpus file order.
    """
    file_ids = movie_reviews.fileids()
    documents = [movie_reviews.raw(file_id) for file_id in file_ids]
    labels = [movie_reviews.categories(file_id)[0] for file_id in file_ids]
    return documents, labels
def get_raw_text(corpus, file_name):
    """Return the raw text of ``file_name`` from the chosen NLTK corpus.

    Args:
        corpus: 'mr' selects ``movie_reviews``; any other value selects
            ``reuters``.
        file_name: Passed to the corpus reader as ``fileids``.
    """
    # Each corpus is imported only when it is the one requested.
    if corpus != 'mr':
        from nltk.corpus import reuters
        return reuters.raw(fileids=file_name)
    from nltk.corpus import movie_reviews
    return movie_reviews.raw(fileids=file_name)
def loadData():
    """Return every movie review as ``(token_list, category)``.

    Each raw review is tokenized with ``word_tokenize``. A progress
    message is printed before and after loading.
    """
    print('loading the dataset')
    dataset = []
    for category in movie_reviews.categories():
        for fileid in movie_reviews.fileids(category):
            tokens = list(word_tokenize(movie_reviews.raw(fileid)))
            dataset.append((tokens, category))
    print('loading is completed')
    return dataset
def get_raw_text(corpus, file_name):
    """Return the raw text of ``file_name`` from the chosen NLTK corpus.

    Args:
        corpus: 'mr' selects ``movie_reviews``; any other value selects
            ``reuters``.
        file_name: Passed to the corpus reader as ``fileids``.
    """
    # Bind whichever reader was requested to one name, then read from it.
    if corpus == 'mr':
        from nltk.corpus import movie_reviews as reader
    else:
        from nltk.corpus import reuters as reader
    return reader.raw(fileids=file_name)
def get_list_tokens_nltk(corpus, file_name):
    """Return the lower-cased content words of one corpus document.

    The raw text is split on runs of non-word characters. A piece is kept
    when it is purely alphabetic, longer than one character, and its
    lower-cased form is not in ``stopwords_english``.

    Args:
        corpus: 'mr' selects ``movie_reviews``; any other value selects
            ``reuters``.
        file_name: Passed to the corpus reader as ``fileids``.
    """
    if corpus == 'mr':
        from nltk.corpus import movie_reviews as reader
    else:
        from nltk.corpus import reuters as reader

    tokens = []
    for piece in re.split(r'\W+', reader.raw(fileids=file_name)):
        if not piece.isalpha() or len(piece) <= 1:
            continue
        lowered = piece.lower()
        if lowered not in stopwords_english:
            tokens.append(lowered)
    return tokens
def loadReview(f, sw):
    """Count how often each word of ``sw`` occurs in one movie review.

    Args:
        f: A ``movie_reviews`` file id.
        sw: Collection of words to count; every other token is ignored.

    Returns:
        A dict mapping each counted word found in the review to its
        number of occurrences.
    """
    counts = {}
    # The review is split on whitespace; only tokens listed in sw count.
    for token in movie_reviews.raw(f).split():
        if token not in sw:
            continue
        counts[token] = counts.get(token, 0) + 1
    return counts
def create_bunch(col):
    """Pack the given movie reviews into a ``sklearn.utils.Bunch``.

    Args:
        col: Iterable of ``movie_reviews`` file ids.

    Returns:
        A Bunch whose ``data`` holds the raw review texts and whose
        ``target`` holds 1 where the first category of the file is 'pos'
        and 0 otherwise.
    """
    # A single pass over col, so one-shot iterables keep working.
    rows = [
        (movie_reviews.raw(fileid),
         1 if movie_reviews.categories(fileid)[0] == 'pos' else 0)
        for fileid in col
    ]
    texts = [text for text, _ in rows]
    targets = [target for _, target in rows]
    return sklearn.utils.Bunch(data=texts, target=targets)
def get_model(self):
    """Return the model, loading it from 'dump.pkl' when that file exists.

    When the file is missing, the raw text and the first category of every
    file in ``reviews`` are handed to ``build_and_evaluate`` together with
    ``outpath='dump.pkl'``, and its result is returned.
    """
    filename = 'dump.pkl'
    if not os.path.isfile(filename):
        texts = [reviews.raw(fileid) for fileid in reviews.fileids()]
        labels = [reviews.categories(fileid)[0]
                  for fileid in reviews.fileids()]
        return build_and_evaluate(texts, labels, outpath=filename)

    # NOTE(review): unpickling executes code from the file; only safe while
    # dump.pkl is produced by this program.
    with open(filename, 'rb') as handle:
        return pickle.load(handle)
def load_moview_reviews(shuffle=True):
    """Return the movie review texts and their labels in shuffled order.

    Args:
        shuffle: When true, the reviews are shuffled with the global
            random generator. When false, they are still permuted, but
            with a fixed pattern that is identical on every call.

    Returns:
        ``(texts, labels)``: two parallel lists; each label is the first
        category of the corresponding file.
    """
    # Materialise the pairs: on Python 3 zip() is a one-shot iterator, so
    # it could neither be shuffled nor traversed twice for the return.
    pairs = [
        (movie_reviews.raw(fileid), movie_reviews.categories(fileid)[0])
        for fileid in movie_reviews.fileids()
    ]
    if shuffle:
        random.shuffle(pairs)
    else:
        # Same permutation that random.shuffle(pairs, lambda: 0.42)
        # produced. That second argument was removed in Python 3.11, so
        # the swap loop is spelled out with the constant in place of a
        # random draw.
        for i in reversed(range(1, len(pairs))):
            j = int(0.42 * (i + 1))
            pairs[i], pairs[j] = pairs[j], pairs[i]
    return [pair[0] for pair in pairs], [pair[1] for pair in pairs]
def main():
    """
    Sample training using the movie reviews corpus (Pang, Lee).

    Loads every review with a 0 ('neg') / 1 (otherwise) score, splits the
    data randomly 80% / 20%, trains a ``SentimentMachine`` on the larger
    part and prints the number of correctly classified test documents and
    the cost on the test set.
    """
    #== load inputs
    documents = np.array([
        movie_reviews.raw(review_id)
        for category in movie_reviews.categories()
        for review_id in movie_reviews.fileids(category)
    ])
    sentiment_scores = np.array([
        0 if category == 'neg' else 1
        for category in movie_reviews.categories()
        for review_id in movie_reviews.fileids(category)
    ])

    #== select random indices
    n = documents.shape[0]
    indices = np.random.permutation(n)
    # Slice bounds must be integers; np.floor returns a float.
    threshold = int(n * 0.8)  # 80% training set / 20% test set
    train_idx, test_idx = indices[:threshold], indices[threshold:]

    #== select training and validation sets according to these indices
    # Both arrays are one-dimensional, so they take a single index array;
    # the former [:, idx] form raises IndexError on a 1-D array.
    x_train, x_test = documents[train_idx], documents[test_idx]
    y_train, y_test = sentiment_scores[train_idx], sentiment_scores[test_idx]

    #== train the model
    print('===== Training the model...')
    sentiment = SentimentMachine(x_train.tolist(), y_train.tolist())
    w = sentiment.train(speed=0.001, stochastic=False)
    print('===== Model trained.')

    #== test efficiency of the model
    print('===== Testing the model...')
    # compute the cost with a sigmoid hypothesis
    h = lambda a, b: sigmoid(np.dot(a, b))
    x = sentiment.compute_features_matrix(x_test.tolist())
    mse = cost(w, x, y_test, h)

    # compute the number of valid classifications
    n_test = y_test.shape[0]
    valid = sum(
        1 for i in range(n_test)
        if sentiment.classify(x_test[i]) == y_test[i]
    )
    percent = 100.0 * valid / n_test

    # print results
    print('== Number of well-classified documents: {0} / {1} ({2}%)'.format(
        valid, n_test, percent))
    print('== Cost value on the test set: %.4f' % mse)
def get_list_tokens_nltk(corpus, file_name):
    """Return the lower-cased content words of one corpus document.

    The raw text is split on runs of non-word characters. A piece is kept
    when it is purely alphabetic, longer than one character, and its
    lower-cased form is not in ``stopwords_english``.

    Args:
        corpus: 'mr' selects ``movie_reviews``; any other value selects
            ``reuters``.
        file_name: Passed to the corpus reader as ``fileids``.
    """
    if corpus != 'mr':
        from nltk.corpus import reuters
        text = reuters.raw(fileids=file_name)
    else:
        from nltk.corpus import movie_reviews
        text = movie_reviews.raw(fileids=file_name)

    # Lower-case each surviving piece once, then drop the stop words.
    candidates = (
        piece.lower()
        for piece in re.split(r'\W+', text)
        if piece.isalpha() and len(piece) > 1
    )
    return [token for token in candidates if token not in stopwords_english]
def add_document_to_corpus(self, filename=None):
    """Add the token counts of one document to the corpus statistics.

    Args:
        filename: Path of a text file to add. When falsy, the raw text of
            the whole ``movie_reviews`` corpus is added instead.

    ``self.num_words`` grows by the number of distinct tokens in the
    document, and ``self.term_freq`` accumulates every token's count.
    """
    # NOTE(review): get_tokens receives an open file object in the first
    # branch and a str in the second; confirm it handles both.
    if filename:
        # The tokens are fully counted inside the ``with`` block, so the
        # handle is closed afterwards (it used to stay open).
        with open(filename, 'r') as inp_file:
            words_freq = Counter(self.get_tokens(inp_file))
    else:
        words_freq = Counter(self.get_tokens(movie_reviews.raw()))

    self.num_words += len(words_freq)
    for word, count in words_freq.items():
        if word in self.term_freq:
            self.term_freq[word] += count
        else:
            self.term_freq[word] = count
def movieReviews(self, category, count):
    """Return ``count`` randomly sampled raw reviews of one polarity.

    Args:
        category: 'positive' or 'negative'. Any other value yields an
            empty list.
        count: Number of reviews to sample (without replacement).

    Returns:
        A list with the raw text of the sampled reviews.
    """
    if category == 'positive':
        corpus_label = 'pos'
    elif category == 'negative':
        corpus_label = 'neg'
    else:
        return []

    sampled_ids = sample(movie_reviews.fileids(corpus_label), count)
    return [movie_reviews.raw(fileid) for fileid in sampled_ids]
def add_document_to_corpus(self, filename=None):
    """Add the token counts of one document to the corpus statistics.

    Args:
        filename: Path of a text file to add. When falsy, the raw text of
            the whole ``movie_reviews`` corpus is added instead.

    ``self.num_words`` grows by the number of distinct tokens in the
    document, and ``self.term_freq`` accumulates every token's count.
    """
    # NOTE(review): get_tokens receives an open file object when a
    # filename is given and a str otherwise; confirm it handles both.
    if not filename:
        words_freq = Counter(self.get_tokens(movie_reviews.raw()))
    else:
        # Counting happens while the file is open; leaving the ``with``
        # block then closes the handle, which used to be leaked.
        with open(filename, 'r') as inp_file:
            words_freq = Counter(self.get_tokens(inp_file))

    self.num_words += len(words_freq)
    for word in words_freq:
        if word in self.term_freq:
            self.term_freq[word] += words_freq[word]
        else:
            self.term_freq[word] = words_freq[word]
def data_preprocessed(self):
    """Build the vocabulary tables and return the cleaned corpus text.

    The whole ``movie_reviews`` text is lower-cased, the characters
    ``[ ] ( ) "`` and the sequence ``...`` are removed, and the first
    ``self.data_size`` characters are kept as the corpus.

    Sets ``self.vocab`` (filtered corpus tokens plus BEGIN and END),
    ``self.vocab_len`` (vocabulary size plus one), and the two lookup
    tables ``self.token2id`` / ``self.id2token`` with ids starting at 1.

    Returns:
        The truncated, cleaned corpus string.
    """
    text = movie_reviews.raw().lower()
    for unwanted in ('[', ']', '(', ')', '"', '...'):
        text = text.replace(unwanted, '')
    corpus = text[:self.data_size]

    self.vocab = set(self.tokens_filter(corpus) + [BEGIN, END])
    self.vocab_len = len(self.vocab) + 1

    # Ids start at 1, which leaves id 0 unassigned.
    token_to_id = {}
    id_to_token = {}
    for index, token in enumerate(self.vocab, start=1):
        token_to_id[token] = index
        id_to_token[index] = token
    self.token2id = token_to_id
    self.id2token = id_to_token
    return corpus
def load_data_and_labels():
    """Load the movie review sentences and their one-hot polarity labels.

    The raw text of all positive files, and separately of all negative
    files, is split into sentences with ``sent_tokenize``. Each sentence
    is cleaned with ``clean_str`` and split on single spaces.

    Returns:
        ``[x_text, y]``: the tokenized sentences (positive ones first) and
        an array with one row per sentence, ``[0, 1]`` for positive and
        ``[1, 0]`` for negative.
    """
    # Load the concatenated text of each polarity and split into sentences.
    positive_examples = sent_tokenize(
        movie_reviews.raw(movie_reviews.fileids('pos')))
    negative_examples = sent_tokenize(
        movie_reviews.raw(movie_reviews.fileids('neg')))

    # Split by words
    x_text = [
        clean_str(sentence).split(" ")
        for sentence in positive_examples + negative_examples
    ]

    # Generate labels
    positive_labels = [[0, 1]] * len(positive_examples)
    negative_labels = [[1, 0]] * len(negative_examples)
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]
def load_movie_documents():
    """Return one ``(category, [token, ...])`` pair per movie review.

    The category is "pos" when the file id starts with "pos/" and "neg"
    otherwise. The corpus text is already preprocessed, so the tokens are
    obtained by splitting the raw text on whitespace.
    """
    return [
        ("pos" if fileid.startswith("pos/") else "neg",
         movie_reviews.raw(fileid).split())
        for fileid in nltk.corpus.movie_reviews.fileids()
    ]
def main():
    """
    Sample training using the movie reviews corpus (Pang, Lee).

    Loads every review with a 0 ('neg') / 1 (otherwise) score, splits the
    data randomly 80% / 20%, trains a ``SentimentMachine`` on the larger
    part and prints the number of correctly classified test documents and
    the cost on the test set.
    """
    #== load inputs
    documents = np.array([movie_reviews.raw(review_id)
                          for category in movie_reviews.categories()
                          for review_id in movie_reviews.fileids(category)])
    sentiment_scores = np.array([0 if category == 'neg' else 1
                                 for category in movie_reviews.categories()
                                 for review_id in movie_reviews.fileids(category)])

    #== select random indices
    n = documents.shape[0]
    indices = np.random.permutation(n)
    # An int is required as a slice bound; np.floor yields a float.
    threshold = int(n * 0.8)  # 80% training set / 20% test set
    train_idx, test_idx = indices[:threshold], indices[threshold:]

    #== select training and validation sets according to these indices
    # The arrays are 1-D, so a single index array selects the rows; the
    # previous [:, idx] form raises IndexError on a 1-D array.
    x_train, x_test = documents[train_idx], documents[test_idx]
    y_train, y_test = sentiment_scores[train_idx], sentiment_scores[test_idx]

    #== train the model
    print('===== Training the model...')
    sentiment = SentimentMachine(x_train.tolist(), y_train.tolist())
    w = sentiment.train(speed=0.001, stochastic=False)
    print('===== Model trained.')

    #== test efficiency of the model
    print('===== Testing the model...')
    # compute the cost with a sigmoid hypothesis
    h = lambda a, b: sigmoid(np.dot(a, b))
    x = sentiment.compute_features_matrix(x_test.tolist())
    mse = cost(w, x, y_test, h)

    # compute the number of valid classifications
    n_test = y_test.shape[0]
    valid = 0
    for i in range(n_test):
        if sentiment.classify(x_test[i]) == y_test[i]:
            valid += 1
    percent = 100.0 * valid / n_test

    # print results
    print('== Number of well-classified documents: {0} / {1} ({2}%)'
          .format(valid, n_test, percent))
    print('== Cost value on the test set: %.4f' % mse)
__author__ = 'a_medelyan'

# Goal: Get movie reviews and read them
# See: http://www.nltk.org/book/ch02.html

from nltk.corpus import movie_reviews

# Every print below takes one parenthesised argument, so the script parses
# on Python 3 and prints the same on Python 2.

# How many documents in this corpus?
print(len(movie_reviews.fileids()))

# What are the categories?
print(movie_reviews.categories())

# What are some files names?
print(movie_reviews.fileids('neg')[:10])
print(movie_reviews.fileids('pos')[:10])

# Print the words in a sample text
print(movie_reviews.words('pos/cv000_29590.txt'))

# Print the original text
print(movie_reviews.raw('pos/cv000_29590.txt'))

# Print the sentences of the text
print(movie_reviews.sents('pos/cv000_29590.txt'))

# Spare time? Calculate the average number of words and sentences in positive and negative reviews
# Do people use a lot more words when giving positive vs. negative reviews?
import collections
import nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB

stopset = stopwords.words('english')

# Raw review texts per polarity, with 0 = negative and 1 = positive tags.
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')
negdocs = [movie_reviews.raw(f) for f in negids]
posdocs = [movie_reviews.raw(f) for f in posids]
negtags = [0] * len(negdocs)
postags = [1] * len(posdocs)

# The first 80% of each class is used for training, the rest for testing.
negcutoff = int(len(negdocs) * 0.8)
poscutoff = int(len(posdocs) * 0.8)
traindocs = negdocs[:negcutoff] + posdocs[:poscutoff]
traintags = negtags[:negcutoff] + postags[:poscutoff]
testdocs = negdocs[negcutoff:] + posdocs[poscutoff:]
# The positive tags must be cut at poscutoff, like the positive documents;
# cutting them at negcutoff misaligns tags and documents whenever the two
# classes differ in size.
testtags = negtags[negcutoff:] + postags[poscutoff:]

# Parenthesised so the line parses on Python 3; with a single argument the
# output on Python 2 is unchanged.
print('train on %d instances, test on %d instances' % (len(traindocs), len(testdocs)))

vectorizer = CountVectorizer(min_df=1, binary=True, stop_words=stopset)
# score these bigrams best_bigrams = bigram_finder.nbest(score_fn, n) # return boolean mapping, as before return dict([(ngram, True) for ngram in itertools.chain(words, best_bigrams)]) if __name__ == '__main__': # load corpus, will be different for corpora not included in nltk neg_ids = movie_reviews.fileids('neg') pos_ids = movie_reviews.fileids('pos') # bag-of-words features neg_feats = [(word_feats(movie_reviews.raw(fileids=[f])), 'neg') for f in neg_ids] pos_feats = [(word_feats(movie_reviews.raw(fileids=[f])), 'pos') for f in pos_ids] # uncomment for bag-of-bigram feats (make sure to comment out the preceding two lines) # neg_feats = [(bigram_feats(movie_reviews.raw(fileids=[f])), 'neg') for f in neg_ids] # pos_feats = [(bigram_feats(movie_reviews.raw(fileids=[f])), 'pos') for f in pos_ids] # create train / test split negcutoff = len(neg_feats)*3/4 poscutoff = len(pos_feats)*3/4 train_feats = neg_feats[:negcutoff] + pos_feats[:poscutoff] test_feats = neg_feats[negcutoff:] + pos_feats[poscutoff:] print 'train on %d instances, test on %d instances' % (len(train_feats), len(test_feats)) # classify
#!/usr/bin/env python
import re
from nltk.corpus import movie_reviews

# Collect the raw text of the first reviews of the corpus.
documents = []
n = 10
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        n -= 1
        # NOTE(review): the counter is decremented before the check, so
        # nine documents are collected, not ten; confirm that is intended.
        # Once n has run out, every later category breaks on its first file.
        if n <= 0:
            break
        documents.append(movie_reviews.raw(fileid))
        # print documents[-1]

for doc in documents:
    # Without re.MULTILINE, ^ only matches at the very start of the review:
    # the pattern captures from a leading "t" up to the last apostrophe on
    # the first line. Raw string, same pattern as before.
    patterns = re.findall(r"^(t.*')", doc)
    if patterns:
        # Parenthesised so the line parses on Python 3; with a single
        # argument the output on Python 2 is unchanged.
        print(patterns[0])
# Every print below takes one parenthesised argument, so the script parses
# on Python 3 and prints the same on Python 2.
print(blob.sentiment)

blob = TextBlob("I love this library")
print(blob.sentiment)

# Test on new movie reviews
transcendence = ['../data/transcendence_1star.txt',
                 '../data/transcendence_5star.txt',
                 '../data/transcendence_8star.txt',
                 '../data/transcendence_great.txt']
# Insert code here

# Spare time? Evaluate both ways of determining sentiment.
# Also test out various polarity thresholds.
# A review is guessed positive when its TextBlob polarity exceeds 0.11.
all_fileids = movie_reviews.fileids()  # fetched once, reused for the total
correct = 0
for fileid in all_fileids:
    raw = movie_reviews.raw(fileid)
    blob = TextBlob(raw)
    sentiment = blob.sentiment
    guessed = 'neg'
    if sentiment.polarity > 0.11:
        guessed = 'pos'
    actual = movie_reviews.categories(fileid)[0]
    if guessed == actual:
        correct += 1

# float() keeps the ratio fractional under Python 2 integer division.
accuracy = float(correct) / len(all_fileids)
print(accuracy)
vect.fit_transform(sample).toarray() vect.get_feature_names() from sklearn.feature_extraction.text import TfidfVectorizer tfidf = TfidfVectorizer() tfidf.fit_transform(sample).toarray() tfidf.get_feature_names() ''' EXAMPLE: Automatically summarize a document ''' # corpus of 2000 movie reviews from nltk.corpus import movie_reviews reviews = [movie_reviews.raw(filename) for filename in movie_reviews.fileids()] # create document-term matrix tfidf = TfidfVectorizer(stop_words='english') dtm = tfidf.fit_transform(reviews) features = tfidf.get_feature_names() import numpy as np # find the most and least "interesting" sentences in a randomly selected review def summarize(): # choose a random movie review review_id = np.random.randint(0, len(reviews)) review_text = reviews[review_id]
raw_text=re.sub('#', ' ', raw_text) raw_text=re.sub('^https?:\/\/.*[\r\n]*', ' ', raw_text) return raw_text document_set=[] st = LancasterStemmer() from nltk.corpus import movie_reviews for category in movie_reviews.categories(): if category=='pos': cat=True else: cat=False for fileid in movie_reviews.fileids(category): raw_text = movie_reviews.raw(fileid) raw_text = filter_text(raw_text) tokens = nltk.word_tokenize(raw_text) #words=[st.stem(i) for i in list(tokens)] document_set.append((list(tokens), cat)) random.shuffle(document_set) test_set=document_set[0:600] whole_set=document_set document_set=whole_set[0:1400] all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
def _report(*fields):
    """Print the given fields on one line, separated by single spaces."""
    print(' '.join(str(field) for field in fields))


def _build_featuresets(featx, texts, targets):
    """Pair each text's features with its label, once via featx and once via preprocess(text, 'dict')."""
    labelled = list(zip(texts, targets))
    selected = [(featx(preprocess(text, 'words')), target) for text, target in labelled]
    raw = [(preprocess(text, 'dict'), target) for text, target in labelled]
    return selected, raw


def _load_newsgroups(categories=None):
    """Fetch the 20 newsgroups train/test subsets; categories=None loads every group."""
    train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
    test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)
    return train.data, train.target, test.data, test.target


def _read_corpus_dir(path, encod):
    """Read a corpus laid out as <path>/<category>/<file>, splitting each category 3/4 train, 1/4 test."""
    cat_dirs = os.listdir(path)
    print("Reading corpus from "+path)
    train_data, train_targets, test_data, test_targets = [], [], [], []
    # Categories are numbered in the order os.listdir returns them
    for ncat, category in enumerate(cat_dirs):
        print("Reading category: "+category)
        temp_data = []
        for filename in os.listdir(path+"/"+category):
            # An empty encoding string is not a valid codec name; fall back to
            # the platform default (None) when no encoding was supplied.
            with io.open(path+"/"+category+"/"+filename, 'r', encoding=encod or None) as handle:
                temp_data.append(preprocess(handle.read(), "text"))
        # Floor division: the cutoff is used as a slice bound
        cutoff = len(temp_data) * 3 // 4
        train_data += temp_data[:cutoff]
        train_targets += [ncat] * cutoff
        test_data += temp_data[cutoff:]
        test_targets += [ncat] * (len(temp_data) - cutoff)
    print("Finish reading corpus. ")
    return train_data, train_targets, test_data, test_targets


def evaluate_classifier(featx, dataset, encod=""):
    """
    Train and evaluate Naive Bayes classifiers on a dataset and print the results.

    Three scikit-learn MultinomialNB models (raw counts, TF, TF-IDF) and two
    NLTK NaiveBayesClassifier models (features from preprocess(text, 'dict')
    and features from featx) are trained on the training part and scored on
    the test part.

    Arguments:
     - featx: callable producing the feature set handed to NLTK's
       NaiveBayesClassifier; it receives movie_reviews.words(...) for the
       "movies" dataset and preprocess(text, 'words') otherwise
     - dataset: "movies" (NLTK movie reviews, first 3/4 of each class for
       training), "20newsgroups-5" (four selected newsgroups),
       "20newsgroups" (all newsgroups), or a directory path containing one
       sub-directory per category (first 3/4 of each category for training)
     - encod: file encoding used when reading a corpus directory; an empty
       string selects the platform default

    Returns nothing; all results are printed.
    """
    if dataset == "movies":
        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')
        negtexts = [preprocess(movie_reviews.raw(fileids=[f]), 'text') for f in negids]
        postexts = [preprocess(movie_reviews.raw(fileids=[f]), 'text') for f in posids]
        negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
        posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
        negfeats2 = [(preprocess(movie_reviews.raw(fileids=[f]), 'dict'), 'neg') for f in negids]
        posfeats2 = [(preprocess(movie_reviews.raw(fileids=[f]), 'dict'), 'pos') for f in posids]

        Nneg = len(negfeats)
        Npos = len(posfeats)
        # Floor division: the cutoffs are used as slice bounds and array lengths
        negcutoff = Nneg * 3 // 4
        poscutoff = Npos * 3 // 4

        trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
        testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
        trainfeats2 = negfeats2[:negcutoff] + posfeats2[:poscutoff]
        testfeats2 = negfeats2[negcutoff:] + posfeats2[poscutoff:]

        # scikit-learn targets: 0 = negative, 1 = positive
        train_data = negtexts[:negcutoff] + postexts[:poscutoff]
        train_targets = np.append(np.zeros(negcutoff, dtype=int),
                                  np.ones(poscutoff, dtype=int))
        test_data = negtexts[negcutoff:] + postexts[poscutoff:]
        test_targets = np.append(np.zeros(Nneg - negcutoff, dtype=int),
                                 np.ones(Npos - poscutoff, dtype=int))
    else:
        if dataset == "20newsgroups-5":
            categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
            train_data, train_targets, test_data, test_targets = _load_newsgroups(categories)
        elif dataset == "20newsgroups":
            train_data, train_targets, test_data, test_targets = _load_newsgroups()
        else:
            # Anything else is treated as a path to a corpus directory
            train_data, train_targets, test_data, test_targets = _read_corpus_dir(dataset, encod)
        trainfeats, trainfeats2 = _build_featuresets(featx, train_data, train_targets)
        testfeats, testfeats2 = _build_featuresets(featx, test_data, test_targets)

    # scikit NB classifier
    print("Scikit classifier: ")
    count_vect = CountVectorizer()
    X_train = count_vect.fit_transform(train_data)
    # The test count matrix is shared by all three models below
    X_test = count_vect.transform(test_data)

    clf = MultinomialNB().fit(X_train, train_targets)
    predicted = clf.predict(X_test)
    _report('Raw counts accuracy: ', np.mean(predicted == test_targets))

    tf_transformer = TfidfTransformer(use_idf=False).fit(X_train)
    X_train_tf = tf_transformer.transform(X_train)
    clf = MultinomialNB().fit(X_train_tf, train_targets)
    predicted = clf.predict(tf_transformer.transform(X_test))
    _report('TF accuracy: ', np.mean(predicted == test_targets))

    tf_transformer = TfidfTransformer().fit(X_train)
    X_train_tf = tf_transformer.transform(X_train)
    clf = MultinomialNB().fit(X_train_tf, train_targets)
    print(clf.feature_log_prob_)
    predicted = clf.predict(tf_transformer.transform(X_test))
    _report('Tfidf accuracy: ', np.mean(predicted == test_targets))

    # NLTK classifier
    print("NLTK classifier: ")
    classifier = NaiveBayesClassifier.train(trainfeats2)
    _report('Raw words accuracy:', nltk.classify.util.accuracy(classifier, testfeats2))

    classifier = NaiveBayesClassifier.train(trainfeats)
    # Index sets per label: refsets holds the gold labels, testsets the predictions
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    _report('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))

    # Only the movie corpus uses 'pos'/'neg' labels; for the other datasets
    # report the labels that actually occur in the test set.
    labels = ['pos', 'neg'] if dataset == "movies" else sorted(refsets)
    for label in labels:
        _report('%s precision:' % label, nltk.metrics.precision(refsets[label], testsets[label]))
        _report('%s recall:' % label, nltk.metrics.recall(refsets[label], testsets[label]))
    classifier.show_most_informative_features()
# <nbformat>3.0</nbformat>

# <codecell>

import nltk
from nltk.corpus import movie_reviews
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
import numpy as np
import random

# <codecell>

# Collect every review as a (document text, category) pair, category by category
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.raw(fileid), category))

# Fixed seed so the shuffled order is the same on every run
random.seed(3)
random.shuffle(documents)

# <codecell>

# Show the first shuffled document and its category
reviewtext, rating = documents[0]
print(reviewtext)
print(rating)

# <codecell>

# The first 1000 shuffled documents form one sample set, the remainder the other
train_samples = documents[:1000]
test_samples = documents[1000:]
# http://www-rohan.sdsu.edu/~gawron/python_for_ss/course_core/book_draft/text/text_classification.html # We import Bo Pang and Lillian Lee’s movie reviews corpus [PANGLEE2004], which is one of the NLTK corpora. from nltk.corpus import movie_reviews as mr # Use a Naive Bayes Classifier from nltk.classify import NaiveBayesClassifier data = dict(pos = mr.fileids('pos'), neg = mr.fileids('neg')) print mr.readme() # The character by character view uses the raw method: print mr.raw(data['pos'][0])[:100] # The word by word character view uses the words method: print mr.words(data['pos'][0])[:10] def unigram_features (words): return dict((word, True) for word in words) def extract_features (corpus, file_ids, cls, feature_extractor=unigram_features): return [(feature_extractor(corpus.words(i)), cls) for i in file_ids] #### Training # Use 90% of the data for training neg_training = extract_features(mr, data['neg'][:900], 'neg',
from nltk.corpus import gutenberg, abc, reuters, brown, movie_reviews
from topia.termextract import extract

# Run the topia term extractor over the local corpus file followed by the raw
# text of five NLTK corpora, writing one tab-separated row per extracted term.
extractor = extract.TermExtractor()
with open('./corpus/all3.txt', 'r') as f, open('./data/terms.txt', 'w') as o:
    o.write("Term\tOccurences\tStrength\n")
    # Local file first, then the NLTK corpora, concatenated into one string
    combined_text = (
        f.read()
        + gutenberg.raw()
        + abc.raw()
        + reuters.raw()
        + brown.raw()
        + movie_reviews.raw()
    )
    for term in extractor(combined_text):
        columns = [str(value) for value in term]
        o.write("\t".join(columns) + "\n")
def getNEG():
    """
    Return normalized raw texts of negative movie reviews.

    The result contains the negative reviews at positions 200..999 of the
    corpus file list, followed by those at positions 800..999 a second time.
    """
    neg_ids = movie_reviews.fileids("neg")
    # NOTE(review): the 800:1000 tail is already part of 200:1000, so those
    # reviews appear twice -- confirm whether the repetition is intended.
    selected_ids = neg_ids[200:1000] + neg_ids[800:1000]
    return [normalize(movie_reviews.raw(fileids=fid)) for fid in selected_ids]
def evaluate(smoothingMethod, POSfreqs, NEGfreqs, POStest, NEGtest, name="", terse=False, validate=False):
    """
    Evaluate the accuracy of trained n-gram models (given by frequency lists and
    a smoothing method) in distinguishing between positive and negative movie reviews.

    Arguments:
     - smoothingMethod: a function f(w, freqs) that takes two arguments, a string <w>
       of length <n> and a frequencyList <freqs> as explained below, and returns the
       smoothed conditional probability of the n-gram <w>,
       i.e. Pr(w_n | w_1 .. w_{n-1}) -- or Pr(w[n-1]|w[0:n-1]) in Python indexing
     - POSfreqs: a list of dictionaries containing n-gram frequency counts for positive
       movie reviews, which can be used by smoothingMethod to compute conditional n-gram
       probabilities; it is customary to list unigram frequency counts first, followed by
       bigrams, etc.; however, any other data structure can be passed provided that it is
       accepted as a second argument by smoothingMethod
     - NEGfreqs: same as above, for negative movie reviews
     - POStest: list of positive reviews (characters strings) used to evaluate the
       trained n-gram models
     - NEGtest: same as above, for negative reviews
     - name: name of the smoothing/interpolation method (for evaluation report)
     - terse: if True, show compact evaluation report in single line
     - validate: test whether smoothingMethod produces a valid probability
       distribution without zeroes

    Note: the values passed for POStest, NEGtest, terse and validate are
    currently ignored; they are overridden at the top of the function body.

    Raises Exception if the two frequency lists or the two test sets differ in
    length, or if validation finds zero probabilities or distributions that do
    not sum to 1.
    """
    # for the contest evaluation, we force validation and terse reporting
    validate = True
    terse = True
    # we also use a new "secret" test set, ignoring the parameters POStest and NEGtest
    # (this should have been held-out data, but by a silly mistake all movie reviews were
    # included in the data set; so we use the first 200 reviews in each set, making sure
    # they aren't part of the training data)
    POStest = [normalize(movie_reviews.raw(fileids=fid)) for fid in movie_reviews.fileids("pos")[0:200]]
    NEGtest = [normalize(movie_reviews.raw(fileids=fid)) for fid in movie_reviews.fileids("neg")[0:200]]

    if len(POSfreqs) != len(NEGfreqs):
        raise Exception("Both lists of n-gram frequency counts (POSfreqs / NEGfreqs) must have the same length!")
    if len(POStest) != len(NEGtest):
        raise Exception("Test set must contain same number of positive and negative reviews for a fair evaluation!")

    n = len(POSfreqs) - 1  # size of n-gram model, with POSfreqs = [corpus size, {unigrams}, {bigrams}, ...]
    n_guesses = 0
    guess = [0, 0, 0]    # n-gram classifier: none / pos / neg
    gold = [0, 0, 0]     # gold standard: none / pos / neg
    correct = [0, 0, 0]  # whether classifier is correct: correct / wrong / no decision
    zeroProb = 0         # number of (review, model) pairs with a zero n-gram probability
    inconsistent = 0     # number of conditional distributions that did not sum to 1

    for category, test_set in [(1, POStest), (2, NEGtest)]:
        for review in test_set:
            review = normalize(review)  # make sure that strings are properly normalized
            ngram = [' '] * n           # first n-gram only consists of stop-characters
            POSlogp = 0                 # calculate probabilities on logarithmic scale to avoid underflow
            NEGlogp = 0
            POSzero = False             # flag is set if any of the conditional n-gram probabilities is zero
            NEGzero = False             # (cannot be represented on log scale)
            for character in review:
                # slide the n-character window one position to the right
                ngram.pop(0)
                ngram.append(character)
                ngram_string = ''.join(ngram)  # convert ngram in fifo to string
                history = ngram_string[:-1]
                next_char = ngram_string[-1:]
                # call smoothingMethod to calculate conditional n-gram probability
                POSngramp = smoothingMethod(ngram_string, POSfreqs)
                NEGngramp = smoothingMethod(ngram_string, NEGfreqs)
                if POSngramp <= 0.0:
                    POSzero = True
                    if validate:
                        print("Error: Pr(%s|%s) = 0 (positive model)" % (next_char, history))
                else:
                    POSlogp += log(POSngramp)
                if NEGngramp <= 0.0:
                    NEGzero = True
                    if validate:
                        print("Error: Pr(%s|%s) = 0 (negative model)" % (next_char, history))
                else:
                    NEGlogp += log(NEGngramp)
                if validate:
                    # the conditional distribution over the whole alphabet must sum to 1
                    POScumprob = 0.0
                    NEGcumprob = 0.0
                    for c in alphabet:
                        POScumprob += smoothingMethod(history + c, POSfreqs)
                        NEGcumprob += smoothingMethod(history + c, NEGfreqs)
                    if abs(POScumprob - 1.0) > 1e-6:
                        print("Error: Sum Pr(*|%s) = %f does not sum to 1 (positive model)" % (history, POScumprob))
                        inconsistent += 1
                    if abs(NEGcumprob - 1.0) > 1e-6:
                        print("Error: Sum Pr(*|%s) = %f does not sum to 1 (negative model)" % (history, NEGcumprob))
                        inconsistent += 1

            # zero probability flagged: set log(Pr(w)) = -Inf for classifier, issue warning later
            if POSzero:
                zeroProb += 1
                POSlogp = -9e99  # practically -Inf
            if NEGzero:
                zeroProb += 1
                NEGlogp = -9e99

            # determine classifier decision and check whether it is correct
            if POSlogp > NEGlogp:
                classifier = 1  # decision: pos
            elif POSlogp < NEGlogp:
                classifier = 2  # decision: neg
            else:
                classifier = 0  # no decision (e.g. if both models have Pr(w) = 0)
            guess[classifier] += 1
            gold[category] += 1
            n_guesses += 1
            if classifier == 0:
                correct[2] += 1
            elif classifier == category:
                correct[0] += 1
            else:
                correct[1] += 1

    accuracy = float(correct[0]) / n_guesses * 100
    baseline = float(max(gold)) / n_guesses * 100
    if gold[1] >= gold[2]:
        majority = "positive"
    else:
        majority = "negative"

    if terse:
        print('accuracy:%6.2f%% (%3d/%3d/%3d) %4d positive,%4d negative %s' % (accuracy, correct[0], correct[1], correct[2], guess[1], guess[2], name))
    else:
        print(' EVALUATION: %s' % (name))
        print('='*35)
        print('')
        print('Correct : %3d reviews' % correct[0])
        print('Wrong : %3d reviews' % correct[1])
        print('Not classified : %3d reviews' % correct[2])
        print('-'*35)
        print('Accuracy :%6.2f%%' % (accuracy))
        print('Baseline :%6.2f%% (always classify as %s)' % (baseline, majority))
        print('')
    if zeroProb > 0:
        print('Warning: probability = 0 estimated in %d cases -- need better smoothing!' % zeroProb)
        print('')

    if validate and (zeroProb > 0 or inconsistent > 0):
        error_message = "N-Gram model (%s) failed validation: %d zero probabilities, %d inconsistencies" % (name, zeroProb, inconsistent)
        raise Exception(error_message)
secure_app_port = 8443
eureka_url = "http://registry:1111/eureka/"

np.set_printoptions(precision=8)

# Read the data
classes = ['pos', 'neg']
train_data = []
train_labels = []
test_data = []
test_labels = []
# Every review of every class goes into the training lists; the test lists
# are left empty here.
for curr_class in classes:
    ids = movie_reviews.fileids(curr_class)
    for f in ids:
        train_data += [movie_reviews.raw(f)]
        train_labels += [curr_class]

# Create feature vectors
vectorizer = TfidfVectorizer(
    min_df=5,
    max_df=0.8,
    sublinear_tf=True,
    use_idf=True,
)
train_vectors = vectorizer.fit_transform(train_data)

# Perform classification with SVM, kernel=rbf
classifier_rbf = svm.SVC()
# Wall-clock time spent fitting the classifier
t0 = time.time()
classifier_rbf.fit(train_vectors, train_labels)
t1 = time.time()
time_rbf_train = t1 - t0
for reference, predicted, text in zip( reference_labels, predicted_labels, reference_text ): if reference != predicted: fh.write("{0} {1}\n{2}\n\n".format(reference, predicted, text)) fh.close() if __name__ == '__main__': # You have to download the movie reviews first #nltk.download("movie_reviews") reviews = [ (movie_reviews.raw(fid), list(movie_reviews.words(fid)), category) for category in movie_reviews.categories() for fid in movie_reviews.fileids(category) ] #print(reviews) # [(),...(), ("text", ["w1", "w2", .....], "pos")] # Make sure we split the same way every time for the live coding random.seed(0) # Make sure to randomize the reviews first! random.shuffle(reviews) # Convert the data into feature vectors featuresets = [ (features(review_text, review_words), label)