Example #1
    def prep_reviews_data(self): # messy code to test classifier with movie reviews
        if not self.movie_review_data:
            print 'Preparing movie reviews...\n'
            from nltk.corpus import movie_reviews
            docs = [movie_reviews.raw(fileid) 
                    for category in movie_reviews.categories() 
                    for fileid in movie_reviews.fileids(category)]

            process = lambda x: 1 if x == 'pos' else -1
            labels = [process(category)
                    for category in movie_reviews.categories() 
                    for fileid in movie_reviews.fileids(category)]

            docs, labels = double_shuffle(docs, labels)
            training, testing = divide_list_by_ratio(docs)
            self.train_labs, self.test_labs = divide_list_by_ratio(labels)

            train_vecs = self.vectorizer.fit_transform(training)
            test_vecs = self.vectorizer.transform(testing)

            if isinstance(self.model, naive_bayes.GaussianNB):
                train_vecs = train_vecs.toarray()
                test_vecs = test_vecs.toarray()

            self.train_vecs = train_vecs
            self.test_vecs = test_vecs

            self.movie_review_data = True
            self.news_market_data = False
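# double_shuffle and divide_list_by_ratio are not defined in this snippet; a
# minimal sketch of helpers with the assumed behavior (shuffle two lists in
# unison, then split a list by a train/test ratio):
import random

def double_shuffle(docs, labels, seed=None):
    # shuffle documents and labels together so the pairs stay aligned
    paired = list(zip(docs, labels))
    random.Random(seed).shuffle(paired)
    shuffled_docs, shuffled_labels = zip(*paired)
    return list(shuffled_docs), list(shuffled_labels)

def divide_list_by_ratio(items, ratio=0.8):
    # split a list into a head/tail pair at the given ratio
    cut = int(len(items) * ratio)
    return items[:cut], items[cut:]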
Example #2
def download_data_if_not_yet():
    """
    Download the dataset if it has not been downloaded yet.
    """
    try:
        # make sure that nltk can find the data
        if paddle.v2.dataset.common.DATA_HOME not in nltk.data.path:
            nltk.data.path.append(paddle.v2.dataset.common.DATA_HOME)
        movie_reviews.categories()
    except LookupError:
        print "Downloading movie_reviews data set, please wait....."
        nltk.download(
            'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME)
        print "Download data set success....."
        print "Path is " + nltk.data.find('corpora/movie_reviews').path
def f2c(corpus,fileName):
    if corpus=='mr':
        from nltk.corpus import movie_reviews as mr
        return mr.categories(fileids = fileName)[0]    
    else:
        from nltk.corpus import reuters
        return reuters.categories(fileids = fileName)[0]    
Example #4
def category_by_movie():
    from nltk.corpus import movie_reviews as mr
    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.corpus import names
    from nltk.classify import apply_features
    import random

    documents = [(list(mr.words(f)), c) for c in mr.categories()
                 for f in mr.fileids(c)]
    random.shuffle(documents)

    all_words = FreqDist(w.lower() for w in mr.words())
    word_features = all_words.keys()[:2000]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    #print document_features(mr.words('pos/cv957_8737.txt'))
    #print documents[0]

    features = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = features[100:], features[:100]
    classifier = NaiveBayesClassifier.train(train_set)
    print classify.accuracy(classifier, train_set)
def create_sentiment():
    """
        Train sentiment model and save.

        Input type: None 
        Output: Model as pickle 
    """

    random.seed(1)

    test = [
        ("The dude presenting Unravel seems like one of the most genuine game developers Ive ever seen I really hope this game works out for him",'pos'),
        ("His hands are shaking Dude looks so stoked and scared at the same time",'pos'),
        ("Right I just felt like I was watching his dream come true It was nice The game looks very well done as well Good for him",'pos'),
        ("Seriously Unravel looks really good actually and honestly seeing him so happy about what hes made is contagious I want to see more of Unravel ",'pos'),
        ("He was so nervous shaking all over his voice quivering",'neg'),
        ("The game looked nice too very cute art style ",'pos'),
        ("You could tell he genuinely wanted to be there it looked like he was even shaking from the excitement  I hope it works out for them aswell",'pos'),
        ("However following that up with the weird PvZ thing was odd To say the least",'neg'),
        ("Haha The game did look nice though Im definitely going to keep an eye on it I enjoy supporting such hopeful developers",'pos'),
        ("Very personable This looks like a buy for me As a dev in a other sector I appreciate this passion",'pos'),
        ("I want to give him a cookie",'pos'),
        ("Im getting a copy Im gonna support my indie devs",'pos'),
        ("The twitch leak was accurate It was like a play by play you start speaking French then switch to English",'neg'),
        ("yep exactly what i was thinking lol its important to note that the twitch leak never had them saying it was Dishonored 2 but that they were honored to be here very different",'neg'),
        ("Honored  Im 100 sure that was intentional",'neg'),
        ("oh yea for sure but wasnt solid enough evidence imo to be like dishonored 2 confirmed just based off that",'neg'),
        ("The confirmation was who was talking not what they were talking about ",'neg'),
        ("How awkward is it for a pop singer to perform at a video game conference",'neg'),
        ("Oh god did they warn him that he will get zero reaction",'neg'),
        ("I really hope so",'pos'),
        ("Almost as bad as Aisha f*****g up her dialogue constantly Shes doing alright though E3 is really becoming a mainstream media event Hollywood has nothing like this ComicCon is the only comparison and they dont dazzle it up like E3",'neg')
        ]


    # Grab review data
    reviews = [
        (list(movie_reviews.words(fileid)), category)
        for category in movie_reviews.categories()
        for fileid in movie_reviews.fileids(category)
        ]
    random.shuffle(reviews)

    # Hold out the last 100 shuffled reviews for testing (1900 train / 100 test)
    new_train, new_test = reviews[:1900], reviews[1900:]

    # Train the NB classifier on the train split
    cl = NaiveBayesClassifier(new_train)

    # Compute accuracy
    accuracy = cl.accuracy(test + new_test)
    print("Accuracy: {0}".format(accuracy))

    # Show 5 most informative features
    cl.show_informative_features(5)

    # Save model for use in creating social model sentiment
    with open('sentiment_clf_full.pkl', 'wb') as pk:
        pickle.dump(cl, pk)
    print('done saving model')
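# Hypothetical usage of the pickled model saved above (this assumes the
# TextBlob-style NaiveBayesClassifier constructed as NaiveBayesClassifier(new_train),
# whose classify() accepts raw text):
# with open('sentiment_clf_full.pkl', 'rb') as pk:
#     clf = pickle.load(pk)
# print(clf.classify("The game looks very well done"))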
def documentClassification():

    from nltk.corpus import movie_reviews

    documents = [(list(movie_reviews.words(fileid)), category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]

    random.shuffle(documents)

    all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
    word_features = all_words.keys()[:2000]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    print document_features(movie_reviews.words('pos/cv957_8737.txt')) 

    featuresets = [(document_features(d), c) for (d,c) in documents]
    train_set, test_set = featuresets[100:], featuresets[:100]
    classifier = nltk.NaiveBayesClassifier.train(train_set)

    print nltk.classify.accuracy(classifier, test_set)
    classifier.show_most_informative_features(5)
def createFeatureSet(numOfExamples):    
    
    documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)[:numOfExamples]]
    with open('documents.txt', 'wb') as f:        
        pickle.dump(documents, f)

##    #read from file
##    with open('documents.txt', 'rb') as f:
##        documents = pickle.load(f)

    random.shuffle(documents)
    
    all_words = []
##    for w in movie_reviews.words():
##        all_words.append(w.lower())
    #write to file
##    with open('allwords.txt', 'wb') as f:
##        pickle.dump(all_words, f)

    #read from file
    with open('allwords.txt', 'rb') as f:
        all_words = pickle.load(f)

    freqDist = nltk.FreqDist(all_words)
    #print('freq dist')
    #print(freqDist.most_common(50))

    word_features = freqDist.most_common(3000)

    featuresets = [(find_features(rev, word_features), category) for (rev, category) in documents]
    return featuresets
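# find_features is not defined in this snippet; a minimal sketch, assuming
# word_features is the list of (word, count) pairs returned by
# freqDist.most_common(3000) above:
def find_features(document, word_features):
    words = set(document)
    features = {}
    for word, _count in word_features:
        features[word] = (word in words)
    return features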
def prepareSentimentClassifier():

	documents = [(list(movie_reviews.words(fileid)), category)
		for category in movie_reviews.categories()
		for fileid in movie_reviews.fileids(category)]

	random.shuffle(documents)

	all_words = []
	for w in movie_reviews.words():
	    all_words.append(w.lower())

	all_words = nltk.FreqDist(all_words)
	
	global word_featuresSent
	word_featuresSent = list(all_words.keys())[:3000]

	featuresets = [(findFeaturesSentiment(rev), category) for (rev, category) in documents]
	
	training_set = featuresets[:1900]
	testing_set = featuresets[1900:]

	sentimentClassifier = nltk.NaiveBayesClassifier.train(training_set)

	print("Classifier accuracy percent:",(nltk.classify.accuracy(sentimentClassifier, testing_set))*100)

	return sentimentClassifier
def create_dataset():
    '''Create dataset from movie reviews dataset'''
    documents = [(list(movie_reviews.words(fileid)), category)
                for category in movie_reviews.categories()
                for fileid in movie_reviews.fileids(category)]
    random.shuffle(documents)
    return documents
def main():
	documents = [(list(movie_reviews.words(fileid)),category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)]
	random.shuffle(documents)
	featuresets = [(document_features8b(d), c) for (d,c) in documents]
	train_set, test_set = featuresets[100:], featuresets[:100]
	classifier = nltk.NaiveBayesClassifier.train(train_set)
	print nltk.classify.accuracy(classifier, test_set)
Example #11
    def __init__(self):
        self.documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
        random.shuffle(self.documents)

        all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
        word_features = all_words.keys()[:2000]
Example #12
def data_run():
    # print "Preparing Data..."
    labels = movie_reviews.categories()
    labeled_words = [(l, movie_reviews.words(categories=[l])) for l in labels]
    high_info_words = set(Params.high_information_words(labeled_words))
    feat_det = lambda words: Params.bag_of_words_in_set(words, high_info_words)
    feats = Train.label_feat_from_corps(movie_reviews, feature_detector=feat_det)
    return Train.split_label_feats(feats)
Example #13
def label_docs():
    docs = [(list(movie_reviews.words(fid)), cat)
            for cat in movie_reviews.categories()
            for fid in movie_reviews.fileids(cat)]
    random.seed(42)
    random.shuffle(docs)

    return docs
Example #14
def main():
    """
    Sample training using the movie reviews corpus (Pang, Lee).
    """

    #== load inputs
    documents = np.array([movie_reviews.raw(review_id) 
        for category in movie_reviews.categories() 
        for review_id in movie_reviews.fileids(category)])

    sentiment_scores = np.array([0 if category == 'neg' else 1 
        for category in movie_reviews.categories() 
        for review_id in movie_reviews.fileids(category)])

    #== select random indices
    n = documents.shape[0]
    indices = np.random.permutation(n)
    threshold = int(np.floor(n*0.8)) # 80% training set / 20% test set
    train_idx, test_idx = indices[:threshold], indices[threshold:]

    #== select training and validation sets according to these indices
    x_train, x_test = documents[train_idx], documents[test_idx]
    y_train, y_test = sentiment_scores[train_idx], sentiment_scores[test_idx]

    #== train the model
    print '===== Training the model...'
    sentiment = SentimentMachine(x_train.tolist(), y_train.tolist())
    w = sentiment.train(speed=0.001, stochastic=False)
    print '===== Model trained.'

    #== test efficiency of the model
    print '===== Testing the model...'
    # compute the MSE
    h = lambda a,b: sigmoid(np.dot(a,b))
    x = sentiment.compute_features_matrix(x_test.tolist())
    mse = cost(w, x, y_test, h)
    # compute the number of valid classifications
    n_test = y_test.shape[0]
    valid = 0
    for i in xrange(n_test):
        valid += 1 if sentiment.classify(x_test[i]) == y_test[i] else 0
    percent = 100.0 * valid / n_test
    # print results
    print ('== Number of well-classified documents: {0} / {1} ({2}%)'
        .format(valid, n_test, percent))
    print '== Cost value on the test set: %.4f' % mse
def get_documents():
    """
    Retrieve shuffled movie reviews from the nltk
    """
    print("Retrieving Movie Reviews\n")

    reviews = [(list(movie_reviews.words(fileid)), category)
               for category in movie_reviews.categories()
               for fileid in movie_reviews.fileids(category)]
    # random.shuffle() shuffles in place and returns None, so use random.sample to get a shuffled copy
    return random.sample(reviews, len(reviews))
def getdata(percentsplit=80, numofdocs=100):
    docs = [(list(movie_reviews.words(fileid)), category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]
    random.shuffle(docs)
    
    if numofdocs == 0:
        numofdocs = len(docs)
    print len(docs)
    numoftrainingexamples = numofdocs * percentsplit / 100
    traindocs, testdocs = docs[:numoftrainingexamples], docs[numoftrainingexamples:]
    return traindocs, testdocs
Example #17
def classify_document():
    from nltk.corpus import movie_reviews
    import random

    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    random.shuffle(documents)
    all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
    allwords = [w for w, _ in all_words.most_common(2000)]
    featuresets = [(document_features(d, allwords), c) for d, c in documents]
    return classify(nltk.NaiveBayesClassifier, featuresets, 0.1)
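# document_features and classify are not shown in this snippet; minimal sketches
# under the assumption that document_features is a bag-of-words feature map over
# the top words, and classify trains on everything except the given test fraction
# and returns the trained classifier together with its held-out accuracy:
import nltk

def document_features(document, allwords):
    words = set(document)
    return {word: (word in words) for word in allwords}

def classify(classifier_class, featuresets, test_fraction):
    cut = int(len(featuresets) * test_fraction)
    test_set, train_set = featuresets[:cut], featuresets[cut:]
    classifier = classifier_class.train(train_set)
    return classifier, nltk.classify.accuracy(classifier, test_set)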
def get_testset_trainset_nltk_mr(train_to_test_ratio=0.3):
    from nltk.corpus import movie_reviews as mr
    train_test = [[],[]]
    for category in mr.categories():
        categories_file_name_dict[category]=mr.fileids(categories=category)
    for cat in categories_file_name_dict.keys():
        li = categories_file_name_dict[cat]
        size=int(len(li)*train_to_test_ratio)
        cat_num_docs[cat]=size
        train_test[0].extend(li[:size])
        train_test[1].extend(li[size:])
    return train_test
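# categories_file_name_dict and cat_num_docs are module-level dicts assumed to be
# defined elsewhere in the original project; a minimal sketch of the expected shape:
categories_file_name_dict = {}  # category -> list of fileids in that category
cat_num_docs = {}               # category -> number of documents used for training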
Example #19
def train_func(func1=Sys_Params.remove_punctuation,
               func2=Sys_Params.non_stop_words,
               func3=Sys_Params.do_pos,
               func4=Sys_Params.do_lmtize_pos,
               func5=Sys_Params.high_information_words, flag=2):

    tst = "This should be a GOOD TEST"
    if flag != 3:
        def func_final(tst):
            tst = func1(tst)
            tst = func2(tst)
            tst = func3(tst)
            ans = func4(tst)
            # ans = Sys_Params.bag_of_words(tst)
            return ans
        # final_func = Sys_Params.bag_of_words(func4(func3(func2(func1()))))
        print "Passing the function"
        labels = movie_reviews.categories()
        labeled_words = [(l, movie_reviews.words(categories=[l])) for l in labels]
        high_info_words = set(Sys_Params.high_information_words(labeled_words))
        feat_det = lambda words: Sys_Params.bag_of_words_in_set(func_final(words), high_info_words)
        # feat_det = lambda words: Sys_Params.bag_of_words_in_set(words, high_info_words)
        feats = label_feat_from_corps(movie_reviews, feature_detector=feat_det)
        # # print final_func
        # return final_func
    elif flag == 3:
        def func_final(tst):
            tst = func1(tst)
            tst = func2(tst)
            tst = func3(tst)
            tst = func4(tst)
            ans = Sys_Params.bag_of_words(tst)
            return ans
        # final_func = Sys_Params.bag_of_words(func4(func3(func2(func1()))))
        print "Passing the function"
        feats = label_feat_from_corps(movie_reviews, func_final)
        # feats = label_feat_from_corps(movie_reviews)
        # print final_func
        # return final_func

    training, testing = split_label_feats(feats)
    Classifier_NB.run(training)
    nb_Classifier = Classifier_NB.load_classifier()
    Classifier_DT.run(training)
    dt_Classifier = Classifier_DT.load_classifier()
    Classifier_ME.run(training)
    me_Classifier = Classifier_ME.load_classifier()
    inst = Classifier_MV.MaxVoteClassifier(nb_Classifier, dt_Classifier, me_Classifier)
    inst.save_classifier(inst)
    print "******DONE TRAINING******"
    return
def load_movie_reviews():
    reviews = [(list(movie_reviews.words(fileid)), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)]
    random.shuffle(reviews)
    print('' + str(len(reviews)) + ' reviews loaded') # 2000
    if False:
        doc = reviews[0]
        print('doc type: ' + str(type(doc)) + ' length: ' + str(len(doc)))
        for elem in doc:
          print('elem type: ' + str(type(elem)) + ' length: ' + str(len(elem)))
        '''
        doc type: <type 'tuple'> length: 2
          elem type: <type 'list'> length: 711  <- array of words in the movie review
          elem type: <type 'str'>  length: 3    <- 'pos', meaning positive review
        '''
    return reviews
Example #21
def document_classification_movie_reviews():
  from nltk.corpus import movie_reviews
  documents = [(list(movie_reviews.words(fileid)), category)
                for category in movie_reviews.categories()
                for fileid in movie_reviews.fileids(category)]
  random.shuffle(documents)
  # use the most frequent 2000 words as features
  all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
  word_features = all_words.keys()[:2000]
  featuresets = [(_document_features(d, word_features), category)
                 for (d,category) in documents]
  train_set, test_set = featuresets[100:], featuresets[:100]
  classifier = nltk.NaiveBayesClassifier.train(train_set)
  print nltk.classify.accuracy(classifier, test_set)
  classifier.show_most_informative_features(30)
def get_data():
    """
    Get movie review data
    """
    dataset = []
    y_labels = []

    #extract categories
    for cat in movie_reviews.categories():
        #for files in each category
        for fileid in movie_reviews.fileids(cat):
            #get the words in that category
            words = list(movie_reviews.words(fileid))
            dataset.append((words, cat))
            y_labels.append(cat)
    return dataset,y_labels
Example #23
    def __init__(self):
        self.documents = [(list(movie_reviews.words(fileid)), category)
                          for category in movie_reviews.categories()
                          for fileid in movie_reviews.fileids(category)]

        random.shuffle(self.documents)

        self.stopset = set(stopwords.words('english'))
        self.K_FOLDS = 10
        self.fullfeatures = []
        self.features_X = []
        self.features_Y = []

        self.negative = ["wasn\'t", 'don\'t', 'not', 'bad', 'worst', 'ugly', 'hate']
        self.end = ['\,', '\.']
        self.negationFeatures = []
def movie_reviews_words():

	# Put all the documents into the list
	
	for category in movie_reviews.categories():
		for fileid in movie_reviews.fileids(category):
			documents.append((movie_reviews.words(fileid), category))

	#random.shuffle(documents)
	# Each item looks like this: ([u'plot', u':', u'a', u'human', u'space', u'astronaut', ...], u'pos')
	print(documents[0])
	print(documents[1800])
	# print('\n')

	# Find the most frequent words across all the documents
	all_words = nltk.FreqDist(movie_reviews.words()) # returns (word, frequency) pairs
	return all_words
Example #25
  def book_train(self, feats):

    documents = [(list(movie_reviews.words(fileid)), category)
                  for category in movie_reviews.categories()
                  for fileid in movie_reviews.fileids(category)]

    random.shuffle(documents)

    self.all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words() if w not in self.stopset)
    self.word_features = self.all_words.keys()[:1000]

    featuresets = [(self.text_feats(d), c) for (d,c) in documents]
    train_set, test_set = featuresets[200:], featuresets[:200]
    self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    print nltk.classify.accuracy(self.classifier, test_set)

    self.classifier.show_most_informative_features(10)
Example #26
def train_func_default():
    print "Passing the function"
    labels = movie_reviews.categories()
    labeled_words = [(l, movie_reviews.words(categories=[l])) for l in labels]
    high_info_words = set(Sys_Params.high_information_words(labeled_words))
    feat_det = lambda words: Sys_Params.bag_of_words_in_set(words, high_info_words)
    feats = label_feat_from_corps(movie_reviews, feature_detector=feat_det)
    training, testing = split_label_feats(feats)
    Classifier_NB.run(training)
    nb_Classifier = Classifier_NB.load_classifier()
    Classifier_DT.run(training)
    dt_Classifier = Classifier_DT.load_classifier()
    Classifier_ME.run(training)
    me_Classifier = Classifier_ME.load_classifier()
    inst = Classifier_MV.MaxVoteClassifier(nb_Classifier, dt_Classifier, me_Classifier)
    inst.save_classifier(inst)
    print "******DONE TRAINING******"
    return
Example #27
def get_word_dict():
    """
    Sort the words by how frequently they occur in the corpus
    :return:
        words_freq_sorted
    """
    words_freq_sorted = list()
    word_freq_dict = collections.defaultdict(int)
    download_data_if_not_yet()

    for category in movie_reviews.categories():
        for field in movie_reviews.fileids(category):
            for words in movie_reviews.words(field):
                word_freq_dict[words] += 1
    words_sort_list = word_freq_dict.items()
    words_sort_list.sort(cmp=lambda a, b: b[1] - a[1])
    for index, word in enumerate(words_sort_list):
        words_freq_sorted.append((word[0], index))
    return words_freq_sorted
Example #28
def train_naive_bayes():
    '''
    Trains a naive Bayes classifier on the NLTK movie reviews corpus to classify
    movie-relevant text as positive or negative

    NOTE:
        Easily generalizable to a different domain given a similarly structured corpus
    '''
    #Modified http://www.nltk.org/book/ch06.html
    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)]
    random.shuffle(documents)
    all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
    global word_features
    word_features = list(all_words)[:3000]
    featuresets = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = featuresets[1000:], featuresets[:1000]
    classifier = nltk.NaiveBayesClassifier.train(train_set)

    return classifier
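# Hypothetical usage of the classifier returned above (document_features and
# word_features are the module-level helpers this snippet already relies on):
# classifier = train_naive_bayes()
# sample = document_features(list(movie_reviews.words('pos/cv957_8737.txt')))
# print(classifier.classify(sample))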
Example #29
def main():

    #from nltk.corpus import names
    #names = ([(name, 'male') for name in names.words('male.txt')] +
    #         [(name, 'female') for name in names.words('female.txt')])
    #random.shuffle(names)
    #print names
    #print
    
    #train_set = names
    #classifier = nltk.NaiveBayesClassifier.train(train_set)
    #name = classifier.classify("Jaime")
    #print name
    
    #featuresets = [(gender_features(n), g) for (n,g) in names]
    #train_set = [(gender_features(n), g) for (n, g) in names]
    #train_set, test_set = featuresets[500:], featuresets[:500]
    #print train_set
    #print
    
    #classifier = nltk.NaiveBayesClassifier.train(train_set)
    #name = classifier.classify(gender_features('Neo'))
    #print name
    
    documents = [(list(movie_reviews.words(fileid)), category)
        for category in movie_reviews.categories()
        for fileid in movie_reviews.fileids(category)]
    random.shuffle(documents)
    
    featuresets = [(document_features(d), c) for (d,c) in documents]
    train_set, test_set = featuresets[100:], featuresets[:100]
    print train_set
    print
    
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    
    print nltk.classify.accuracy(classifier, test_set)
    classifier.show_most_informative_features(5)
    
    print 'done'
def makePrediction():

    labels = movie_reviews.categories()
    print("Labels for reviews are: {}\n".format(labels) )

    labeled_words = [(label, movie_reviews.words(categories=[label])) for label in labels]
    print("Labeled words:{}\n".format(labeled_words[:10]))

    high_info_words = set(Toolbox.high_information_words(labeled_words))
    print("High information  words:{}\n".format(list(high_info_words)[:10]))

    feat_det = lambda words: Toolbox.bag_of_words_in_set(words, high_info_words)

    lfeats = Toolbox.label_feats_from_corpus(movie_reviews, feature_detector=feat_det)

    train_feats, test_feats = Toolbox.split_label_feats(lfeats)

    mv_classifier = ClassifierTrainer.trainClassifier(train_feats)

    accuracyScore = accuracy(mv_classifier, test_feats)

    print("Accuracy is {}".format(accuracyScore))
Example #31
import random
from nltk.corpus import movie_reviews
from review_sentiment import ReviewSentiment
import classification

if __name__ == '__main__':
    labeled_data = [(movie_reviews.raw(fileids=fileid),
                     movie_reviews.categories(fileid)[0])
                    for fileid in movie_reviews.fileids()]
    random.seed(1234)
    random.shuffle(labeled_data)
    labeled_data = labeled_data[:100]
    rs = ReviewSentiment(labeled_data, train_size=50)
    classifiers = classification.train(rs)
    classification.evaluate(rs, classifiers)
    classifier = classifiers[0][0]
    print()
    print("positive reviews prediction")
    classification.predict(rs, "data/positive/", classifier, 0)
    print()
    print("negative reviews prediction")
    classification.predict(rs, "data/negative/", classifier, 0)
Example #32
# Naive Bayes Classifier

import nltk
import math
from nltk.corpus import movie_reviews, stopwords

# get our movie reviews from nltk.corpus (reviews stored as tuples (review, class))
documents = [(movie_reviews.raw(fileid), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
classes = movie_reviews.categories()  # ['pos', 'neg']

trainingSet = documents[100:900] + documents[1100:1900]
devSet = documents[900:1000] + documents[1900:]
testSet = documents[:100] + documents[1000:1100]


def train(trainingSet, classes):
    # train the data
    n = len(trainingSet)  # total number of docs
    log_prior = {}  # dictionary to hold log prior for all cases

    fulltext = ""

    # dictionary that holds bigdoc for each class
    bigdoc_dict = {}

    # dictionary that holds number of docs in each class
    num_docs = {}
    for c in classes:
        bigdoc_dict[c] = ""

    def classify(self, features):
        # collect each sub-classifier's vote and return the majority label
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)

        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf


documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

#random.shuffle(documents)

all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:3000]


def find_features(document):
    # mark which of the top 3,000 words appear in the document
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features
ne_nrr = ne_chunk(ne_tag)

new = "The cat ate the little mouse who was after fresh cheese"
new_token = nltk.pos_tag(word_tokenize(new))
grammar_np = r"NP: {<DT>?<JJ>*<NN>}"
chunk_parser = nltk.RegexpParser(grammar_np)
chunk_result = chunk_parser.parse(new_token)
import pandas as pd
import numpy as np
import sklearn

from sklearn.feature_extraction.text import CountVectorizer

from nltk.corpus import movie_reviews
movie_reviews.categories()
pos_rev = movie_reviews.fileids("pos")
neg_rev = movie_reviews.fileids("neg")
rev = nltk.corpus.movie_reviews.words('pos/cv565_29572.txt')
rev_list = []

for rev in neg_rev:
    rev_text_neg = nltk.corpus.movie_reviews.words(rev)
    review_one_string = " ".join(rev_text_neg)
    review_one_string = review_one_string.replace(" ,", ",")
    review_one_string = review_one_string.replace(" .", ".")
    review_one_string = review_one_string.replace("\' ", "'")
    review_one_string = review_one_string.replace(" \'", "'")
    rev_list.append(review_one_string)

len(rev_list)
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
import nltk
import os
import nltk.corpus

print(os.listdir(nltk.data.find('corpora')))

from nltk.corpus import movie_reviews
print(movie_reviews.categories())

print(len(movie_reviews.fileids('pos')))
print()
print(movie_reviews.fileids('pos'))
neg_rev = movie_reviews.fileids('neg')
print(len(neg_rev))
print(neg_rev)
rev = nltk.corpus.movie_reviews.words('pos/cv000_29590.txt')
print(rev)

rev_list = []

for rev in neg_rev:
    rev_text_neg = nltk.corpus.movie_reviews.words(rev)
    review_one_string = " ".join(rev_text_neg)
import nltk
from nltk.corpus import movie_reviews
from pylab import plot,show
from numpy import array
from numpy.random import rand
from scipy.cluster.vq import kmeans,vq,whiten
import numpy as np
import random

documents = [(' '.join(movie_reviews.words(fileid)), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)]  # (review joined into a single string, category label) pairs
random.shuffle(documents)	
documents_words=[w for (w,t) in documents]

from sklearn import feature_extraction  
from sklearn.feature_extraction.text import TfidfTransformer  
from sklearn.feature_extraction.text import CountVectorizer  

vectorizer=CountVectorizer(min_df=20,stop_words='english') # builds the term-frequency matrix; element a[i][j] is the count of word j in document i
transformer=TfidfTransformer() # computes the tf-idf weight of each word
tfidf=transformer.fit_transform(vectorizer.fit_transform(documents_words)) # term-frequency matrix first, then tf-idf
word=vectorizer.get_feature_names() # all words in the bag-of-words vocabulary
features=tfidf.toarray() # dense tf-idf matrix; element a[i][j] is the tf-idf weight of word j in document i


target=[c for (d,c) in documents]

data=whiten(features)
centroids,_ = kmeans(data,2)
idx,_ = vq(data,centroids)

target1=[1 if x =='pos' else 0 for x in target]
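# The original snippet stops here; a minimal, hypothetical check of how well the
# two k-means clusters line up with the pos/neg labels (cluster ids are arbitrary,
# so take the better of the two possible assignments):
agreement = np.mean(np.array(idx) == np.array(target1))
print(max(agreement, 1 - agreement))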
print(w5.wup_similarity(w4))





###################### Text Classification ##########################


from nltk.corpus import movie_reviews   #already labeled
import random
import nltk

# categories: pos, neg; get the word vectors of every file in movie_reviews
documents=[(list(movie_reviews.words(fileid)),category) for category in movie_reviews.categories()
           for fileid in movie_reviews.fileids(category)] #tuples

#the words are the features

random.shuffle(documents)  # shuffle

all_words=[]
for w in movie_reviews.words():
    all_words.append(w.lower())
    
    
# nltk words frequency distribution
#39768 in total
allwords=nltk.FreqDist(all_words)
print(allwords.most_common(15))
Example #38
import nltk
nltk.download('reuters')
from nltk.corpus import reuters
reuters.fileids()
reuters.categories()
fileid = 'test/16399'
text = reuters.raw(fileid)
text1=reuters.raw(categories='zinc')
reuters.categories(fileid)


import nltk
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews
movie_reviews.fileids()
movie_reviews.categories()
fileid = 'neg/cv956_12547.txt'
text = movie_reviews.raw(fileid)
text1= movie_reviews.raw(categories='neg')
movie_reviews.categories(fileid)


#Frequency distribution by creating our own corpus

from nltk.corpus import PlaintextCorpusReader
corpus_root = 'C:/Users/ITRAIN-12/Desktop/Day 2'
fileid = 'gaming.txt'
my_corpus = PlaintextCorpusReader(corpus_root, '.*')
text = my_corpus.raw(fileid)
text
my_corpus.raw(fileid)
my_corpus.words(fileid)
#############################################################################

from featx import label_feats_from_corpus, split_label_feats, high_information_words, bag_of_words_in_set
from classification import precision_recall  # classification.py must be in the same directory
from nltk.corpus import movie_reviews
from nltk.classify.util import accuracy
from nltk.classify import NaiveBayesClassifier
from nltk.classify import MaxentClassifier
from nltk.classify import DecisionTreeClassifier
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import LinearSVC

# "high information word" es una palabra que esta fuertemente sesgada hacia una unica etiqueta de clasificacion
# "low information words"  es una o varias palabras que son comunes en todas las etiquetas de clasificacion.

labels = movie_reviews.categories()
labeled_words = [(l, movie_reviews.words(categories=[l])) for l in labels]

high_info_words = set(high_information_words(labeled_words))
feat_det = lambda words: bag_of_words_in_set(words, high_info_words)

lfeats = label_feats_from_corpus(movie_reviews, feature_detector=feat_det)
train_feats, test_feats = split_label_feats(lfeats)

print("######################################################################")
nb_classifier = NaiveBayesClassifier.train(train_feats)
print("Accuracy Naive Bayes: " + str(accuracy(nb_classifier, test_feats)))
# Accuracy: 0.91
nb_precisions, nb_recalls = precision_recall(nb_classifier, test_feats)
print("Precisions Naive Bayes Pos: " + str(nb_precisions['pos']))
# Precisions Pos: 0.8988326848249028
Example #40
#!/usr/bin/env python
# coding: utf-8

# In[27]:

import nltk
from nltk import NaiveBayesClassifier
from nltk.corpus import movie_reviews
import random

# In[28]:

cats = movie_reviews.categories()
reviews = []
for a in cats:
    for fid in movie_reviews.fileids(a):
        review = (list(movie_reviews.words(fid)), a)
        reviews.append(review)
random.shuffle(reviews)

# In[3]:

all_wd_in_reviews = nltk.FreqDist(wd.lower() for wd in movie_reviews.words())
top_wd_in_reviews = [
    list(wds) for wds in zip(*all_wd_in_reviews.most_common(2000))
][0]

# In[4]:


def ext_ft(review, top_words):
    # bag-of-words features: which of the top words appear in the review
    review_words = set(review)
    features = {}
    for word in top_words:
        features[word] = (word in review_words)
    return features
Example #41
def main():
    # - categories 'neg' and 'pos'
    # - fileid ex - 'neg/cv000_29416.txt'
    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]

    random.shuffle(documents)

    training_set = documents[:1600]  #1600 reviews in training set
    held_out_set = documents[-400:-200]  #200 reviews in held-out set
    test_set = documents[-200:]  # 200 reviews in test set

    sentiment = Sentiment()
    sentiment.Set_uni_bi_stat(training_set)
    sentiment.Conclude_uni_bi_stat()

    real_labels = sentiment.Real_labels(held_out_set)

    # MaxEnt with only Bigram features
    print "--- MaxEnt---"

    cutoffs1 = [0, 25, 50, 100, 250, 500, 1000]
    cutoffs2 = [0, 12, 25, 50, 125, 250, 500]
    sentiment.Uni_count_list()
    sentiment.Bi_count_list()

    # with info gain
    print "--- Unigram + Bigram with information gain---"
    sentiment.Uni_high_score_list()
    sentiment.Bi_high_score_list()

    for cutoff in cutoffs2:
        print "uni features: 500"
        print "bi features: " + str(cutoff)

        trainsets_uni = sentiment.Feature_set_uni(500, training_set, 2)
        trainsets_bi = sentiment.Feature_set_bi(cutoff, training_set, 2)

        # combine the feature set of unigram and bigram
        trainsets_uni_bi = []
        for i in range(len(trainsets_uni)):
            temp = dict(trainsets_uni[i][0])
            temp.update(trainsets_bi[i][0])
            trainsets_uni_bi.append((temp, trainsets_uni[i][1]))

        held_out_sets_uni = sentiment.Feature_set_uni(500, held_out_set, 2)
        held_out_sets_bi = sentiment.Feature_set_bi(cutoff, held_out_set, 2)
        held_out_sets_uni_bi = []
        for i in range(len(held_out_sets_uni)):
            temp = dict(held_out_sets_uni[i][0])
            temp.update(held_out_sets_bi[i][0])
            held_out_sets_uni_bi.append((temp, held_out_sets_uni[i][1]))

        algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0]
        classifier_uni_bi = nltk.MaxentClassifier.train(trainsets_uni_bi,
                                                        algorithm,
                                                        max_iter=50)
        classifier_uni_bi.show_most_informative_features(10)
        print "\nClassifier Accuracy : %4f\n" % nltk.classify.accuracy(
            classifier_uni_bi, held_out_sets_uni_bi)
        predicted_labels_uni_bi = sentiment.Maxent_predicted(
            classifier_uni_bi, held_out_sets_uni_bi)
        sentiment.Evaluation(predicted_labels_uni_bi, real_labels)

    # without info gain
    print "--- Unigram + Bigram without info gain---"
    for cutoff in cutoffs1:
        print "uni features: 1000"
        print "bi features: " + str(cutoff)

        trainsets_uni = sentiment.Feature_set_uni(1000, training_set, 1)
        trainsets_bi = sentiment.Feature_set_bi(cutoff, training_set, 1)

        # combine the feature set of unigram and bigram
        trainsets_uni_bi = []
        for i in range(len(trainsets_uni)):
            temp = dict(trainsets_uni[i][0])
            temp.update(trainsets_bi[i][0])
            trainsets_uni_bi.append((temp, trainsets_uni[i][1]))
        held_out_sets_uni = sentiment.Feature_set_uni(1000, held_out_set, 1)
        held_out_sets_bi = sentiment.Feature_set_bi(cutoff, held_out_set, 1)
        held_out_sets_uni_bi = []
        for i in range(len(held_out_sets_uni)):
            temp = dict(held_out_sets_uni[i][0])
            temp.update(held_out_sets_bi[i][0])
            held_out_sets_uni_bi.append((temp, held_out_sets_uni[i][1]))

        algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0]
        classifier_uni_bi = nltk.MaxentClassifier.train(trainsets_uni_bi,
                                                        algorithm,
                                                        max_iter=50)
        classifier_uni_bi.show_most_informative_features(10)
        print "\nClassifier Accuracy : %4f\n" % nltk.classify.accuracy(
            classifier_uni_bi, held_out_sets_uni_bi)
        predicted_labels_uni_bi = sentiment.Maxent_predicted(
            classifier_uni_bi, held_out_sets_uni_bi)
        sentiment.Evaluation(predicted_labels_uni_bi, real_labels)
def main():
	# - categories 'neg' and 'pos'
	# - fileid ex - 'neg/cv000_29416.txt'
	documents = [(list(movie_reviews.words(fileid)), category)
		for category in movie_reviews.categories()
			for fileid in movie_reviews.fileids(category)]
	random.shuffle(documents)

	(documents_adj, documents_adj_adv, documents_adj_adv_v) = POS(documents)
	
	cutoffs = [100, 200, 500, 1000, 2000, 3000]

	# adj only
	print "--- adj only ---"
	training_set_adj = documents_adj[:1600] #1600 sentences in training set
	held_out_set_adj = documents_adj[-400:-200] #200 sentences in held out set
	test_set_adj = documents_adj[-200:] # 200 sentences in test out set

	sentiment_adj = Sentiment()
	sentiment_adj.Set_word_stat(training_set_adj)
	sentiment_adj.Conclude_word_stat()
	real_labels_adj = sentiment_adj.Real_labels(held_out_set_adj)

	# without info gain
	sentiment_adj.Uni_count_list()

	for cutoff in cutoffs:
		print "cutoff = " + str(cutoff)
		trainsets_adj = sentiment_adj.Feature_set_uni(cutoff, training_set_adj, 1)
		held_out_sets_adj = sentiment_adj.Feature_set_uni(cutoff, held_out_set_adj, 1)
		algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0]
		classifier_adj = nltk.MaxentClassifier.train(trainsets_adj, algorithm, max_iter = 50)
		classifier_adj.show_most_informative_features(10)
		print "\nClassifier Accuracy : %4f\n" % nltk.classify.accuracy(classifier_adj, held_out_sets_adj)
		predicted_labels_adj = sentiment_adj.Maxent_predicted(classifier_adj, held_out_sets_adj)
		sentiment_adj.Evaluation(predicted_labels_adj, real_labels_adj)
	
	# with info gain
	print "--- adj only with information gain---"
	sentiment_adj.Uni_high_score_list()

	for cutoff in cutoffs:
		print "cutoff = " + str(cutoff)
		trainsets_adj = sentiment_adj.Feature_set_uni(cutoff, training_set_adj, 2)
		held_out_sets_adj = sentiment_adj.Feature_set_uni(cutoff, held_out_set_adj, 2)
		algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0]
		classifier_adj = nltk.MaxentClassifier.train(trainsets_adj, algorithm, max_iter = 50)
		classifier_adj.show_most_informative_features(10)
		print "\nClassifier Accuracy : %4f\n" % nltk.classify.accuracy(classifier_adj, held_out_sets_adj)
		predicted_labels_adj = sentiment_adj.Maxent_predicted(classifier_adj, held_out_sets_adj)
		sentiment_adj.Evaluation(predicted_labels_adj, real_labels_adj)

	# adj + adv
	print "--- adj + adv ---"
	training_set_adj_adv = documents_adj_adv[:1600] #1600 sentences in training set
	held_out_set_adj_adv = documents_adj_adv[-400:-200] #200 sentences in held out set
	test_set_adj_adv = documents_adj_adv[-200:] # 200 sentences in test set

	sentiment_adj_adv = Sentiment()
	sentiment_adj_adv.Set_word_stat(training_set_adj_adv)
	sentiment_adj_adv.Conclude_word_stat()

	real_labels_adj_adv = sentiment_adj_adv.Real_labels(held_out_set_adj_adv)

	# without info gain
	sentiment_adj_adv.Uni_count_list()

	for cutoff in cutoffs:
		print "cutoff = " + str(cutoff)
		trainsets_adj_adv = sentiment_adj_adv.Feature_set_uni(cutoff, training_set_adj_adv, 1)
		held_out_sets_adj_adv = sentiment_adj_adv.Feature_set_uni(cutoff, held_out_set_adj_adv, 1)
		algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0]
		classifier_adj_adv = nltk.MaxentClassifier.train(trainsets_adj_adv, algorithm, max_iter = 50)
		classifier_adj_adv.show_most_informative_features(10)
		print "\nClassifier Accuracy : %4f\n" % nltk.classify.accuracy(classifier_adj_adv, held_out_sets_adj_adv)
		predicted_labels_adj_adv = sentiment_adj_adv.Maxent_predicted(classifier_adj_adv, held_out_sets_adj_adv)
		sentiment_adj_adv.Evaluation(predicted_labels_adj_adv, real_labels_adj_adv)
	
	# with info gain
	print "--- adj + adv with information gain---"
	sentiment_adj_adv.Uni_high_score_list()

	for cutoff in cutoffs:
		print "cutoff = " + str(cutoff)
		trainsets_adj_adv = sentiment_adj_adv.Feature_set_uni(cutoff, training_set_adj_adv, 2)
		held_out_sets_adj_adv = sentiment_adj_adv.Feature_set_uni(cutoff, held_out_set_adj_adv, 2)
		algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0]
		classifier_adj_adv = nltk.MaxentClassifier.train(trainsets_adj_adv, algorithm, max_iter = 50)
		classifier_adj_adv.show_most_informative_features(10)
		print "\nClassifier Accuracy : %4f\n" % nltk.classify.accuracy(classifier_adj_adv, held_out_sets_adj_adv)
		predicted_labels_adj_adv = sentiment_adj_adv.Maxent_predicted(classifier_adj_adv, held_out_sets_adj_adv)
		sentiment_adj_adv.Evaluation(predicted_labels_adj_adv, real_labels_adj_adv)

	# adj + adv + v
	print "--- adj + adv + v---"
	training_set_adj_adv_v = documents_adj_adv_v[:1600] #1600 sentences in training set
	held_out_set_adj_adv_v = documents_adj_adv_v[-400:-200] #200 sentences in held out set
	test_set_adj_adv_v = documents_adj_adv_v[-200:] # 200 sentences in test set

	sentiment_adj_adv_v = Sentiment()
	sentiment_adj_adv_v.Set_word_stat(training_set_adj_adv_v)
	sentiment_adj_adv_v.Conclude_word_stat()

	real_labels_adj_adv_v = sentiment_adj_adv_v.Real_labels(held_out_set_adj_adv_v)

	# without info gain
	sentiment_adj_adv_v.Uni_count_list()

	for cutoff in cutoffs:
		print "cutoff = " + str(cutoff)
		trainsets_adj_adv_v = sentiment_adj_adv_v.Feature_set_uni(cutoff, training_set_adj_adv_v, 1)
		held_out_sets_adj_adv_v = sentiment_adj_adv_v.Feature_set_uni(cutoff, held_out_set_adj_adv_v, 1)
		algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0]
		classifier_adj_adv_v = nltk.MaxentClassifier.train(trainsets_adj_adv_v, algorithm, max_iter = 50)
		classifier_adj_adv_v.show_most_informative_features(10)
		print "\nClassifier Accuracy : %4f\n" % nltk.classify.accuracy(classifier_adj_adv_v, held_out_sets_adj_adv_v)
		predicted_labels_adj_adv_v = sentiment_adj_adv_v.Maxent_predicted(classifier_adj_adv_v, held_out_sets_adj_adv_v)
		sentiment_adj_adv_v.Evaluation(predicted_labels_adj_adv_v, real_labels_adj_adv_v)
	
	# with info gain
	print "--- adj + adv + v with information gain---"
	sentiment_adj_adv_v.Uni_high_score_list()

	for cutoff in cutoffs:
		print "cutoff = " + str(cutoff)
		trainsets_adj_adv_v = sentiment_adj_adv_v.Feature_set_uni(cutoff, training_set_adj_adv_v, 2)
		held_out_sets_adj_adv_v = sentiment_adj_adv_v.Feature_set_uni(cutoff, held_out_set_adj_adv_v, 2)
		algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0]
		classifier_adj_adv_v = nltk.MaxentClassifier.train(trainsets_adj_adv_v, algorithm, max_iter = 50)
		classifier_adj_adv_v.show_most_informative_features(10)
		print "\nClassifier Accuracy : %4f\n" % nltk.classify.accuracy(classifier_adj_adv_v, held_out_sets_adj_adv_v)
		predicted_labels_adj_adv_v = sentiment_adj_adv_v.Maxent_predicted(classifier_adj_adv_v, held_out_sets_adj_adv_v)
		sentiment_adj_adv_v.Evaluation(predicted_labels_adj_adv_v, real_labels_adj_adv_v)
def preprocess(checkpoint=True):
    """
	Reads, formats, and concatenates the data frames into one.
	:param checkpoint: True to save the resulting data frame: bool
	"""
    # getting nltk dataset:
    documents = [(movie_reviews.raw(fileid), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    # data framing
    nltk_df = pd.DataFrame()
    for review, category in documents:
        temp = pd.DataFrame(data={
            'text': review,
            'category': category
        },
                            index=[0])
        nltk_df = nltk_df.append(temp)
    nltk_df.reset_index(drop=True, inplace=True)
    nltk_df['category'] = nltk_df['category'].map(lambda x: 0
                                                  if x == 'neg' else 1)

    # getting tweets dataset from stanford:
    tweets_df = pd.read_csv(
        '/Mining_The_Social_Web/datasets/tweetsstanford_training.csv',
        sep=',',
        header=None,
        names=['category', 'id', 'date', 'query', 'user', 'text'])
    tweets_df['category'] = tweets_df['category'].map(lambda x: 1
                                                      if x == 4 else 0)

    # getting dataset from University of Michigan:
    umich_df = pd.read_csv(
        '/Mining_The_Social_Web/datasets/umich_training.txt',
        sep="\t",
        header=None,
        names=['category', 'text'])

    # getting reviews dataset from Amazon:
    amazon_df = pd.read_csv(
        '/Mining_The_Social_Web/datasets/amazon_cells_labelled.txt',
        sep="\t",
        header=None,
        names=['text', 'category'])

    # getting review dataset from IMDB
    imdb_df = pd.read_csv('/Mining_The_Social_Web/datasets/imdb_labelled.txt',
                          sep="\t",
                          header=None,
                          names=['text', 'category'])

    # getting review dataset from Yelp
    yelp_df = pd.read_csv('/Mining_The_Social_Web/datasets/yelp_labelled.txt',
                          sep="\t",
                          header=None,
                          names=['text', 'category'])

    # concatenate ALL:
    trainset_df = pd.concat([
        nltk_df, tweets_df[['category', 'text']], umich_df, yelp_df, imdb_df,
        amazon_df
    ])
    trainset_df.reset_index(drop=True, inplace=True)

    if checkpoint:
        trainset_df.to_csv(
            path_or_buf='/Mining_The_Social_Web/datasets/alltrainset.csv',
            header=['category', 'text'],
            columns=['category', 'text'],
            index=None,
            sep='\t',
            mode='w')
    return trainset_df
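# Hypothetical usage: reload the concatenated training set saved above, using the
# same separator that preprocess() wrote it with:
# trainset_df = pd.read_csv('/Mining_The_Social_Web/datasets/alltrainset.csv', sep='\t')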
Example #44
def test_movie_reviews():
  """ http://www.cs.cornell.edu/people/pabo/movie-review-data/
      http://www.nltk.org/book/ch06.html


      https://streamhacker.com/2010/05/10/text-classification-sentiment-analysis-naive-bayes-classifier/
https://streamhacker.com/2010/05/24/text-classification-sentiment-analysis-stopwords-collocations/
      https://streamhacker.com/2010/06/16/text-classification-sentiment-analysis-eliminate-low-information-features/


evaluating single word features
accuracy: 0.728
pos precision: 0.651595744681
pos recall: 0.98
neg precision: 0.959677419355
neg recall: 0.476


evaluating best word features
accuracy: 0.93
pos precision: 0.890909090909
pos recall: 0.98
neg precision: 0.977777777778
neg recall: 0.88


Significant Bigrams
evaluating best words + bigram chi_sq word features
accuracy: 0.92
pos precision: 0.913385826772
pos recall: 0.928
neg precision: 0.926829268293
neg recall: 0.912


NaiveBayesClassifier
train on 1900 instances, test on 100 instances
pos precision: 0.7435897435897436
pos recall: 0.5370370370370371
pos F-measure: 0.6236559139784946
neg precision: 0.5901639344262295
neg recall: 0.782608695652174
neg F-measure: 0.6728971962616822

Rules-based SentimentAnalyzer
pos precision:0.6031746031746031
pos recall:0.7037037037037037
pos F-measure:0.6495726495726496
neg precision:0.5675675675675675
neg recall:0.45652173913043476
neg F-measure:0.5060240963855421


      """

  from nltk.corpus import movie_reviews
  from nltk.metrics import precision, recall, f_measure
  from nltk.classify import NaiveBayesClassifier
  import random  
  import collections

  # data
  documents = [(list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)]
  
  #random.shuffle(documents)
  
  # SET: this line is only here for debugging; remove it once development is done.
  documents = documents[:200]

  train_docs = documents[100:]
  test_docs = documents[:100]


  # negids = movie_reviews.fileids('neg')
  # posids = movie_reviews.fileids('pos')

  # negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
  # posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

  # negcutoff = len(negfeats)*3/4
  # poscutoff = len(posfeats)*3/4

  # trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
  # testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
 
 

  # Machine Learning approach
  do_ML = False                             # SET  
  refsets = collections.defaultdict(set)

  if do_ML:
    print ('NaiveBayesClassifier')

    # preprocessing 
    print ('+ preprocessing')  
    all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
    word_features = list(all_words)[:2000] 

    def document_features(document): 
      document_words = set(document) 
      features = {}
      for word in word_features:
          features['contains({})'.format(word)] = (word in document_words)
      return features

    train_featuresets = [(document_features(d), c) for (d,c) in train_docs]
    test_featuresets = [(document_features(d), c) for (d,c) in test_docs]

    # training
    print ('+ train on %d instances' % (len(train_featuresets)))
    classifier = nltk.NaiveBayesClassifier.train(train_featuresets)


    # testing
    print ('+ test on %d instances' % ( len(test_featuresets)))
    classifier_hypsets = collections.defaultdict(set)
    
    for i, (feats, label) in enumerate(test_featuresets):
      refsets[label].add(i)
      classifier_hyp = classifier.classify(feats)
      classifier_hypsets[classifier_hyp].add(i)
   
    print ('pos precision:', precision(refsets['pos'], classifier_hypsets['pos']))
    print ('pos recall:', recall(refsets['pos'], classifier_hypsets['pos']))
    print ('pos F-measure:', f_measure(refsets['pos'], classifier_hypsets['pos']))
    print ('neg precision:', precision(refsets['neg'], classifier_hypsets['neg']))
    print ('neg recall:', recall(refsets['neg'], classifier_hypsets['neg']))
    print ('neg F-measure:', f_measure(refsets['neg'], classifier_hypsets['neg']))


  # 
  print ('Rules-based SentimentAnalyzer')
  sa = SentimentAnalyzer()       

  # preprocessing 
  print ('+ preprocessing')  

  def pyrata_structure_as_features (doc):
    tokens_pos = nltk.pos_tag(doc)
    pyrata_tokens = [{'raw':w, 'pos':p, 'lc':w.lower()} for (w, p) in tokens_pos]
    return pyrata_tokens

  train_featuresets = [(pyrata_structure_as_features(d), c) for (d,c) in train_docs]
  test_featuresets = [(pyrata_structure_as_features(d), c) for (d,c) in test_docs]

  #
  print ('+ train on %d instances' % (len(train_featuresets)))
  for i, (doc, label) in enumerate(train_featuresets):
    print ('Debug: label={}'.format(label))
    sa.booster_extraction(doc)

  # testing
  print ('+ test on %d instances' % (len(test_featuresets)))  
  rules_based_hypsets = collections.defaultdict(set)

  for i, (doc, label) in enumerate(test_featuresets):
    #print ('Debug: doc={}'.format(doc))

    rules_based_hyp = sa.label_polarity(doc)
    rules_based_hypsets[rules_based_hyp].add(i)
 
  print ('pos precision:{:10}'.format(precision(refsets['pos'], rules_based_hypsets['pos'])))
  print ('pos recall:{:10}'.format(recall(refsets['pos'], rules_based_hypsets['pos'])))
  print ('pos F-measure:{:10}'.format(f_measure(refsets['pos'], rules_based_hypsets['pos'])))
  print ('neg precision:{:10}'.format(precision(refsets['neg'], rules_based_hypsets['neg'])))
  print ('neg recall:{:10}'.format(recall(refsets['neg'], rules_based_hypsets['neg'])))
  print ('neg F-measure:{:10}'.format(f_measure(refsets['neg'], rules_based_hypsets['neg'])))
Example #45
def main():
    global documents
    global all_words # contains all the words in movie reviews
    global word_features  # list containing all the word features
    global featuresets
    global training_set
    global testing_set

    documents = [(list(movie_reviews.words(fileid)), category)  # [(review, category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    all_words = []

    # documents = []
    # for category in movie_reviews.categories():
    # 	for fileid in movie_reviews.fileids(category):
    # 		documents.append(list(movie_reviews.words(fileid)), category)

    random.shuffle(documents)  # to prevent extreme bias

    # print(documents[1])

    for w in movie_reviews.words():
        all_words.append(w.lower())  # we need to convert all words to lower case

    all_words = nltk.FreqDist(all_words)
    word_features = list(all_words.keys())[:3000]  # top 3000 words

    # print(all_words.most_common(15))  # prints out the 15 most common words
    # print("Number of times stupid pops up {}".format(all_words["stupid"]))
    # print((find_features(movie_reviews.words('neg/cv000_29416.txt'), word_features)))
    featuresets = [(find_features(rev, word_features), category) for (rev, category) in documents]

    training_set = featuresets[:1900]
    testing_set = featuresets[1900:]

    classifier = nltk.NaiveBayesClassifier.train(training_set)
    print("Naive Bayes Algorithm accuracy: ", (nltk.classify.accuracy(classifier, testing_set)) * 100)
    classifier.show_most_informative_features(15)

    # save_classifier = open("naivebayes.pickle", "wb")
    # pickle.dump(classifier, save_classifier)
    # save_classifier.close()

    classifier_f = open("naivebayes.pickle", "rb")
    classifier = pickle.load(classifier_f)
    print("Naive Bayes Algorithm accuracy: ", (nltk.classify.accuracy(classifier, testing_set)) * 100)
    classifier_f.close()

    # MultinomialNB
    MNB_classifier = SklearnClassifier(MultinomialNB())
    MNB_classifier.train(training_set)
    print("MNB Classifier Algorithm accuracy: ", (nltk.classify.accuracy(MNB_classifier, testing_set)) * 100)

    # GaussianNB
    # GaussianNB_classifier = SklearnClassifier(GaussianNB())
    # GaussianNB_classifier.train(training_set)
    # print("GaussianNB Classifier Algorithm accuracy: ", (nltk.classify.accuracy(GaussianNB_classifier, testing_set)) * 100)

    # BernoulliNB
    BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
    BernoulliNB_classifier.train(training_set)
    print("BernoulliNB Classifier Algorithm accuracy: ", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set)) * 100)

    # LogisticRegression, SGDClassifier
    # SVC, LinearSVC, NuSVC

    # Logistic Regression
    LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
    LogisticRegression_classifier.train(training_set)
    print("LogisticRegression Classifier Algorithm accuracy: ", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set)) * 100)

    # SGDClassifier
    SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
    SGDClassifier_classifier.train(training_set)
    print("SGDClassifier Classifier Algorithm accuracy: ", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set)) * 100)

    # SVC
    SVC_classifier = SklearnClassifier(SVC())
    SVC_classifier.train(training_set)
    print("SVC Classifier Algorithm accuracy: ", (nltk.classify.accuracy(SVC_classifier, testing_set)) * 100)

    # LinearSVC
    LinearSVC_classifier = SklearnClassifier(LinearSVC())
    LinearSVC_classifier.train(training_set)
    print("LinearSVC Classifier Algorithm accuracy: ", (nltk.classify.accuracy(LinearSVC_classifier, testing_set)) * 100)

    # NuSVC
    NuSVC_classifier = SklearnClassifier(NuSVC())
    NuSVC_classifier.train(training_set)
    print("NuSVC Classifier Algorithm accuracy: ", (nltk.classify.accuracy(NuSVC_classifier, testing_set)) * 100)
Exemple #46
0
def evaluate(test, category_list):
	'''
	Return the percentage of test reviews whose category was predicted correctly.
	'''
	return 100.0 * sum(1 for x in test_model(test, category_list)
		if x[0] == mr.categories(x[1])[0]) / len(test)
Exemple #47
0
import nltk
import random
from nltk.corpus import movie_reviews

print(movie_reviews.categories())
# ['neg', 'pos']

docs = [(list(movie_reviews.words(fileid)), category)
        for category in movie_reviews.categories()
        for fileid in movie_reviews.fileids(category)]

random.shuffle(docs)

all_words = []
for w in movie_reviews.words():
    all_words.append(w)
all_words = nltk.FreqDist(all_words)

print(all_words("gorgeous"))
#50
Exemple #48
0
from nltk.corpus import movie_reviews

print(movie_reviews.categories(movie_reviews.fileids()[10]))
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
nltk.download("stopwords")


def extract_features(word_list):
    return dict([(word, True) for word in word_list])


# Create a list of movie review document
documents = []

for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        # documents.append((list(movie_reviews.words(fileid)), category))
        documents.append((movie_reviews.words(fileid), category))

if __name__ == '__main__':
    # Load positive and negative reviews
    positive_fileids = movie_reviews.fileids('pos')
    negative_fileids = movie_reviews.fileids('neg')
    # print("No.of.postive fileds",positive_fileids)
    # print("No.of.Negative fields",negative_fileids)

    # Total reviews
    print("Total No.of.Reviews in Movies",
          len(movie_reviews.fileids()))  # Output: 2000

    # Review categories
    print("Categoriacal variables",
Exemple #50
0
"""
Sentiment Analysis-Movie Reviews using NLTK

@author: Sathish Sampath([email protected])
Developed as part of  Microsoft's NLP MOOC(https://www.edx.org/course/natural-language-processing-nlp)


"""

# movie reviews / sentiment analysis
import nltk
from nltk.corpus import movie_reviews as reviews
import random

# Input Documents
docs = [(list(reviews.words(id)), cat) for cat in reviews.categories()
        for id in reviews.fileids(cat)]

# Shuffle the input
random.shuffle(docs)

fd = nltk.FreqDist(word.lower() for word in reviews.words())
topKeys = [key for (key, value) in fd.most_common(2000)]


def review_features(doc):
    docSet = set(doc)
    features = {}

    for word in topKeys:
        features[word] = (word in docSet)

    return features
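
# Hedged continuation (not in the original excerpt): build featuresets with
# review_features over the top-2000 keys and train an NLTK Naive Bayes
# classifier; the 1900/100 split is illustrative.
featuresets = [(review_features(doc), cat) for (doc, cat) in docs]
train_set, test_set = featuresets[:1900], featuresets[1900:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))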
Exemple #51
0
import nltk
import random
from nltk.corpus import movie_reviews

documents = []

for category in movie_reviews.categories():  # category is pos or neg
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))

random.shuffle(documents)

#print(documents[1])

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)  # convert into a format that nltk can use
print(all_words.most_common(15))

print(all_words["stupid"])  # how many times this word appears in the reviews
Exemple #52
0
        s = s.encode('utf8')
        if s == predictions[p]:
            correct += 1
        p = p + 1
    return (correct / float(len(test))) * 100.0


#The calling code
voc = []  #Contains vocabularies
voc = movie_reviews.words()

docs = []  #Contains all docs irrespective of category
docs = movie_reviews.fileids()

C = []
C = movie_reviews.categories()  ##Contains all the categories

#splitting the data set
splitRatio_train = 0.60
splitRatio_test = 0.20
splitRatio_cross = 0.20
train, test, cross = splitDataset(docs, splitRatio_train, splitRatio_test,
                                  splitRatio_cross)

#Calling the TrainBernoulli function with the training data
V, prior, condprob = TrainBernoulli(C, train)

#Calling the ApplyBernoulliNB function
score = []  #List that stores the predicted class labels of the test data
score = ApplyBernoulliNB(C, V, prior, condprob, test)
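
# Hedged sketch (not in the original): compare the predicted labels in `score`
# against the gold categories of the held-out test fileids to get an accuracy
# percentage; this assumes `score` holds 'pos'/'neg' strings and `test` holds
# fileids, as the calling code above suggests.
gold = [movie_reviews.categories(fileid)[0] for fileid in test]
accuracy = 100.0 * sum(1 for g, p in zip(gold, score) if g == p) / len(test)
print("Bernoulli NB accuracy: %.2f%%" % accuracy)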
"""

################ MOVIE REVIEWS - SENTIMENT ANALYSIS ##############

################ IMPORT DATA AND EXPLORE #########################
# Importing nltk & random package
import nltk
import random
# Imporing the dataset & stopwords corpus
from nltk.corpus import stopwords
from nltk.corpus import movie_reviews


# Creating documents list which stores file name and its category as pos or neg
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
# we randomly shuffle the documents before creating training and testing datasets
random.shuffle(documents)


# randomly chosing 40th document to see its content
print(documents[40])

# Listing categories
movie_reviews.categories()

# Listing unique file ids
movie_reviews.fileids()

# Finding out number of categories
Exemple #54
0
import nltk     # assumed; used below but not imported in this excerpt
import pprint   # assumed; pprint is used below but not imported in this excerpt

nltk.download('words')

from nltk.corpus import wordnet as wn
from nltk.corpus import movie_reviews
from nltk.corpus import sentiwordnet as wdn
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.util import ngrams

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS  # the old sklearn.feature_extraction.stop_words module was removed in newer scikit-learn

pp = pprint.PrettyPrinter(indent=4)

neg, pos = movie_reviews.categories()

new_phrases = []
for ids in movie_reviews.fileids(neg):
    for phrase in movie_reviews.sents(ids)[1:]:
        if len(phrase) > 3:
            new_phrases.append({
                'type': 'neg',
                'phrase': ' '.join(phrase).lower(),
                'pos_score': 0.0,
                'neg_score': 0.0,
                'over_score': 0.0
            })
for ids in movie_reviews.fileids(pos):
    for phrase in movie_reviews.sents(ids):
        if len(phrase) > 3:
            new_phrases.append({
                'type': 'pos',
                'phrase': ' '.join(phrase).lower(),
                'pos_score': 0.0,
                'neg_score': 0.0,
                'over_score': 0.0
            })

# imports assumed so this word-embedding fragment runs on its own
import random
import numpy
from gensim.models import KeyedVectors
from nltk.corpus import movie_reviews

w2v = KeyedVectors.load_word2vec_format("wiki.en.5k.vec", binary=False)
#print("Number of words: %d" % len(w2v.vocab))

def we_represent(tokens):
    # sum the 300-d embeddings of every in-vocabulary token (a bag-of-embeddings
    # document representation); look up the same lower-cased form that was tested
    vec = numpy.zeros(300)
    for tok in tokens:
        if tok.lower() in w2v:
            vec += w2v[tok.lower()]
    return vec

training_instances = []
training_labels = []
test_instances = []
test_labels = []

for label in movie_reviews.categories():
    for fileid in movie_reviews.fileids(label):
        doc = movie_reviews.words(fileid)
        instance = we_represent(doc)
        if label == 'pos':
            lbl = 1
        else:
            lbl = 0
        if random.randint(0, 9) == 0:
            test_instances.append(instance)
            test_labels.append(lbl)
        else:
            training_instances.append(instance)
            training_labels.append(lbl)

print(training_instances)
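
# Hedged sketch (not in the original): fit a scikit-learn classifier on the
# summed word-embedding features built above and report held-out accuracy;
# LogisticRegression is an assumption, the excerpt does not say which model it
# trains.
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=1000)
clf.fit(training_instances, training_labels)
print("Embedding-feature accuracy:", clf.score(test_instances, test_labels))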
def setup_reviews():
    global documents
    documents += [(find_features(list(movie_reviews.words(fileid))), category)
                  for category in movie_reviews.categories()
                  for fileid in movie_reviews.fileids(category)]
Exemple #57
0
def main():

    DOC_SIZE = 1000
    TRAIN_SIZE = int(DOC_SIZE * 0.75)

    pos_files = movie_reviews.fileids(categories='pos')[:DOC_SIZE]
    neg_files = movie_reviews.fileids(categories='neg')[:DOC_SIZE]

    train_pos_files = pos_files[:TRAIN_SIZE]
    train_neg_files = neg_files[:TRAIN_SIZE]

    test_pos_files = pos_files[TRAIN_SIZE:] 
    test_neg_files = neg_files[TRAIN_SIZE:] 

    print('Corpus Size: {}'.format(len(pos_files + neg_files)))
    print('Training Size:\n\tpositive: {}\tnegative: {}'.format(len(train_pos_files), len(train_neg_files)))
    print('Testing Size:\n\tpositive: {}\tnegative: {}'.format(len(test_pos_files), len(test_neg_files)))
    print()

    # training datasets
    datasets = create_bunch(train_pos_files + train_neg_files)
    text_train, y_train = datasets.data, datasets.target

    # testing datasets
    datasets = create_bunch(test_pos_files + test_neg_files)
    text_test, y_test = datasets.data, datasets.target

    # vectorize training and testing data sets
    vectorizer = CountVectorizer(min_df=5, ngram_range=(2, 2))
    x_train = vectorizer.fit(text_train).transform(text_train)
    x_test = vectorizer.transform(text_test)
    
    # vocabulary
    features = vectorizer.get_feature_names_out()  # get_feature_names() on scikit-learn < 1.0
    # print(features)
    param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
    grid = GridSearchCV(LogisticRegression(solver='lbfgs', max_iter=200), param_grid, cv=5)
    grid.fit(x_train, y_train)

    lr = grid.best_estimator_
    lr.fit(x_train, y_train)
    lr.predict(x_test)

    print("Accuracy score: {:.2f}".format(lr.score(x_test, y_test)))
    print()
    print()

    # predictions
    print('Predicting movie reviews using Logistic Regression classifier:')
    print('prediction:')
    print('[0] => negative\n[1] => positive')
    print()

    test_datasets = test_pos_files + test_neg_files
    random.shuffle(test_datasets)

    for i in range(10):
        r = random.randint(0, len(test_datasets) - 1)
        f = test_datasets[r]
        actual = movie_reviews.categories(f)
        raw = [movie_reviews.raw(f)]
        predict = lr.predict(vectorizer.transform(raw))
        print('Test doc: {}\t\tactual class: {}\t\tprediction: {}'.format(r, actual, predict))
Exemple #58
0
    def getDocuments(self):
        categories = movie_reviews.categories()
        documents = [(fileid, category)
                     for category in categories
                     for fileid in movie_reviews.fileids(category)]
        return documents
#         predict=classifier.classify(gender_features(name))
#         if(predict!=tag):
#             errors.append((name,tag,predict))
#     errors.sort()
#     for name,tag, predict in errors[:10]:
#         print("name=%-30s "%name+"tag=%-8s"%tag+"predict=%-8s"%predict)

# ###########################################################################
# Building a document classification model                                  #
#
#############################################################################
# Build the documents: here we use the movie_reviews corpus and give each
# document a positive or negative label.
from nltk.corpus import movie_reviews
import random
document = [(list(movie_reviews.words(fileid)), tag)
            for tag in movie_reviews.categories()
            for fileid in movie_reviews.fileids(tag)]
random.shuffle(document)
# print(document[:10])
# Build the features. How? The simplest approach is to record, for each word in
# the vocabulary, whether it occurs in the document (a one-hot / presence model).
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = [w for (w, _) in all_words.most_common(2000)]  # the 2,000 most frequent words


def get_feature_doc(document):
    words = set(document)
    features = {}
    for word in word_features:
        features["contains{0}".format(word)] = word in words
    return features
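
# Hedged continuation (not in the original excerpt): use get_feature_doc to
# build featuresets and train an NLTK Naive Bayes classifier; the 100-document
# test split is illustrative.
featuresets = [(get_feature_doc(d), tag) for (d, tag) in document]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(5)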
Python 3.8.2 (tags/v3.8.2:7b3ab59, Feb 25 2020, 23:03:10) [MSC v.1916 64 bit (AMD64)] on win32
Type "help", "copyright", "credits" or "license()" for more information.
>>> from nltk.corpus import movie_reviews
>>> print (len(movie_reviews.fileids())) #total reviews
2000
>>> print (movie_reviews.categories()) #review categories
['neg', 'pos']
>>> print (len(movie_reviews.fileids('pos'))) #pos reviews
1000
>>> print (len(movie_reviews.fileids('neg'))) #neg reviews
1000
>>> positive_review_file = movie_reviews.fileids('pos')[0]
>>> print (positive_review_file)
pos/cv000_29590.txt
>>> documents = [] #creating a movie review document
>>> for category in movie_reviews.categories():
	for fileid in movie_reviews.fileids(category):
		documents.append((movie_reviews.words(fileid), category))

		
>>> print (len(documents))
2000
>>> x = [str(item) for item in documents[0][0]]
>>> print(x)
['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', '.', 'one', 'of', 'the', 'guys', 'dies', ',', 'but', 'his', 'girlfriend', 'continues', 'to', 'see', 'him', 'in', 'her', 'life', ',', 'and', 'has', 'nightmares', '.', 'what', "'", 's', 'the', 'deal', '?', 'watch', 'the', 'movie', 'and', '"', 'sorta', '"', 'find', 'out', '.', '.', '.', 'critique', ':', 'a', 'mind', '-', 'f**k', 'movie', 'for', 'the', 'teen', 'generation', 'that', 'touches', 'on', 'a', 'very', 'cool', 'idea', ',', 'but', 'presents', 'it', 'in', 'a', 'very', 'bad', 'package', '.', 'which', 'is', 'what', 'makes', 'this', 'review', 'an', 'even', 'harder', 'one', 'to', 'write', ',', 'since', 'i', 'generally', 'applaud', 'films', 'which', 'attempt', 'to', 'break', 'the', 'mold', ',', 'mess', 'with', 'your', 'head', 'and', 'such', '(', 'lost', 'highway', '&', 'memento', ')', ',', 'but', 'there', 'are', 'good', 'and', 'bad', 'ways', 'of', 'making', 'all', 'types', 'of', 'films', ',', 'and', 'these', 'folks', 'just', 'didn', "'", 't', 'snag', 'this', 'one', 'correctly', '.', 'they', 'seem', 'to', 'have', 'taken', 'this', 'pretty', 'neat', 'concept', ',', 'but', 'executed', 'it', 'terribly', '.', 'so', 'what', 'are', 'the', 'problems', 'with', 'the', 'movie', '?', 'well', ',', 'its', 'main', 'problem', 'is', 'that', 'it', "'", 's', 'simply', 'too', 'jumbled', '.', 'it', 'starts', 'off', '"', 'normal', '"', 'but', 'then', 'downshifts', 'into', 'this', '"', 'fantasy', '"', 'world', 'in', 'which', 'you', ',', 'as', 'an', 'audience', 'member', ',', 'have', 'no', 'idea', 'what', "'", 's', 'going', 'on', '.', 'there', 'are', 'dreams', ',', 'there', 'are', 'characters', 'coming', 'back', 'from', 'the', 'dead', ',', 'there', 'are', 'others', 'who', 'look', 'like', 'the', 'dead', ',', 'there', 'are', 'strange', 'apparitions', ',', 'there', 'are', 'disappearances', ',', 'there', 'are', 'a', 'looooot', 'of', 'chase', 'scenes', ',', 'there', 'are', 'tons', 'of', 'weird', 'things', 'that', 'happen', ',', 'and', 'most', 'of', 'it', 'is', 'simply', 'not', 'explained', '.', 'now', 'i', 'personally', 'don', "'", 't', 'mind', 'trying', 'to', 'unravel', 'a', 'film', 'every', 'now', 'and', 'then', ',', 'but', 'when', 'all', 'it', 'does', 'is', 'give', 'me', 'the', 'same', 'clue', 'over', 'and', 'over', 'again', ',', 'i', 'get', 'kind', 'of', 'fed', 'up', 'after', 'a', 'while', ',', 'which', 'is', 'this', 'film', "'", 's', 'biggest', 'problem', '.', 'it', "'", 's', 'obviously', 'got', 'this', 'big', 'secret', 'to', 'hide', ',', 'but', 'it', 'seems', 'to', 'want', 'to', 'hide', 'it', 'completely', 'until', 'its', 'final', 'five', 'minutes', '.', 'and', 'do', 'they', 'make', 'things', 'entertaining', ',', 'thrilling', 'or', 'even', 'engaging', ',', 'in', 'the', 'meantime', '?', 'not', 'really', '.', 'the', 'sad', 'part', 'is', 'that', 'the', 'arrow', 'and', 'i', 'both', 'dig', 'on', 'flicks', 'like', 'this', ',', 'so', 'we', 'actually', 'figured', 'most', 'of', 'it', 'out', 'by', 'the', 'half', '-', 'way', 'point', ',', 'so', 'all', 'of', 'the', 'strangeness', 'after', 'that', 'did', 'start', 'to', 'make', 'a', 'little', 'bit', 'of', 'sense', ',', 'but', 'it', 'still', 'didn', "'", 't', 'the', 'make', 'the', 'film', 'all', 'that', 'more', 'entertaining', '.', 'i', 'guess', 'the', 'bottom', 'line', 'with', 'movies', 'like', 'this', 'is', 'that', 'you', 'should', 'always', 'make', 'sure', 'that', 'the', 'audience', 'is', '"', 'into', 'it', '"', 'even', 
'before', 'they', 'are', 'given', 'the', 'secret', 'password', 'to', 'enter', 'your', 'world', 'of', 'understanding', '.', 'i', 'mean', ',', 'showing', 'melissa', 'sagemiller', 'running', 'away', 'from', 'visions', 'for', 'about', '20', 'minutes', 'throughout', 'the', 'movie', 'is', 'just', 'plain', 'lazy', '!', '!', 'okay', ',', 'we', 'get', 'it', '.', '.', '.', 'there', 'are', 'people', 'chasing', 'her', 'and', 'we', 'don', "'", 't', 'know', 'who', 'they', 'are', '.', 'do', 'we', 'really', 'need', 'to', 'see', 'it', 'over', 'and', 'over', 'again', '?', 'how', 'about', 'giving', 'us', 'different', 'scenes', 'offering', 'further', 'insight', 'into', 'all', 'of', 'the', 'strangeness', 'going', 'down', 'in', 'the', 'movie', '?', 'apparently', ',', 'the', 'studio', 'took', 'this', 'film', 'away', 'from', 'its', 'director', 'and', 'chopped', 'it', 'up', 'themselves', ',', 'and', 'it', 'shows', '.', 'there', 'might', "'", 've', 'been', 'a', 'pretty', 'decent', 'teen', 'mind', '-', 'f**k', 'movie', 'in', 'here', 'somewhere', ',', 'but', 'i', 'guess', '"', 'the', 'suits', '"', 'decided', 'that', 'turning', 'it', 'into', 'a', 'music', 'video', 'with', 'little', 'edge', ',', 'would', 'make', 'more', 'sense', '.', 'the', 'actors', 'are', 'pretty', 'good', 'for', 'the', 'most', 'part', ',', 'although', 'wes', 'bentley', 'just', 'seemed', 'to', 'be', 'playing', 'the', 'exact', 'same', 'character', 'that', 'he', 'did', 'in', 'american', 'beauty', ',', 'only', 'in', 'a', 'new', 'neighborhood', '.', 'but', 'my', 'biggest', 'kudos', 'go', 'out', 'to', 'sagemiller', ',', 'who', 'holds', 'her', 'own', 'throughout', 'the', 'entire', 'film', ',', 'and', 'actually', 'has', 'you', 'feeling', 'her', 'character', "'", 's', 'unraveling', '.', 'overall', ',', 'the', 'film', 'doesn', "'", 't', 'stick', 'because', 'it', 'doesn', "'", 't', 'entertain', ',', 'it', "'", 's', 'confusing', ',', 'it', 'rarely', 'excites', 'and', 'it', 'feels', 'pretty', 'redundant', 'for', 'most', 'of', 'its', 'runtime', ',', 'despite', 'a', 'pretty', 'cool', 'ending', 'and', 'explanation', 'to', 'all', 'of', 'the', 'craziness', 'that', 'came', 'before', 'it', '.', 'oh', ',', 'and', 'by', 'the', 'way', ',', 'this', 'is', 'not', 'a', 'horror', 'or', 'teen', 'slasher', 'flick', '.', '.', '.', 'it', "'", 's', 'just', 'packaged', 'to', 'look', 'that', 'way', 'because', 'someone', 'is', 'apparently', 'assuming', 'that', 'the', 'genre', 'is', 'still', 'hot', 'with', 'the', 'kids', '.', 'it', 'also', 'wrapped', 'production', 'two', 'years', 'ago', 'and', 'has', 'been', 'sitting', 'on', 'the', 'shelves', 'ever', 'since', '.', 'whatever', '.', '.', '.', 'skip', 'it', '!', 'where', "'", 's', 'joblo', 'coming', 'from', '?', 'a', 'nightmare', 'of', 'elm', 'street', '3', '(', '7', '/', '10', ')', '-', 'blair', 'witch', '2', '(', '7', '/', '10', ')', '-', 'the', 'crow', '(', '9', '/', '10', ')', '-', 'the', 'crow', ':', 'salvation', '(', '4', '/', '10', ')', '-', 'lost', 'highway', '(', '10', '/', '10', ')', '-', 'memento', '(', '10', '/', '10', ')', '-', 'the', 'others', '(', '9', '/', '10', ')', '-', 'stir', 'of', 'echoes', '(', '8', '/', '10', ')']
>>> print (documents[0])
(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg')
>>> from random import shuffle
>>> shuffle(documents) #shuffle the document list
>>>