Example #1
    def __init__(self, pos=None, neg=None):
        if not pos:
            # self.__pos = [open(f).read() for f in glob('review_polarity/txt_sentoken/pos/*.txt')]
            self.__pos = [
                movie_reviews.raw(file)
                for file in movie_reviews.fileids('pos')
            ]
        else:
            self.__pos = pos
        if not neg:
            # self.__neg = [open(f).read() for f in glob('review_polarity/txt_sentoken/neg/*.txt')]
            self.__neg = [
                movie_reviews.raw(file)
                for file in movie_reviews.fileids('neg')
            ]
        else:
            self.__neg = neg

        if os.path.isfile('classifier.pickle'):
            # Load the features
            with open('classifier.pickle', 'rb') as f:
                self.__classifier = pickle.load(f)
        else:
            # Train a data set
            self.__classifier = nltk.NaiveBayesClassifier.train(
                self.__train_data())

            # Cache the features for faster predictions
            with open('classifier.pickle', 'wb') as f:
                pickle.dump(self.__classifier, f)
def get_sentilyzer():
    """
    Train and return the sentiment analyzer.
    """
    from nltk.corpus import movie_reviews

    try:
        classifier = pickle.load(open("data/sentilyzer.pickle", "rb"))
        return classifier
    except (IOError, EOFError, pickle.UnpicklingError):
        print "Unable to load sentilyzer, so training it again"

    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(extract_features(movie_reviews.raw(fileids=[f])), 'neg') for f in negids]
    posfeats = [(extract_features(movie_reviews.raw(fileids=[f])), 'pos') for f in posids]
    
    print "Length of Negative Features", len(negfeats)
    print "Length of Positive Features", len(posfeats)

    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))

    classifier = nltk.NaiveBayesClassifier.train(trainfeats)
    print 'accuracy of sentiment analysis:', nltk.classify.util.accuracy(classifier, testfeats)
    pickle.dump(classifier, open("data/sentilyzer.pickle", "wb"))
    return classifier
Example #3
def load_reviews():
    """ Load movie reviews from nltk and split into train, dev and test."""
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    reviews = [TextDocument(movie_reviews.raw(fileids=[id]), id, 1) for id in posids] + \
              [TextDocument(movie_reviews.raw(fileids=[id]), id, -1) for id in negids]
    # Get reproducible data split by setting a deterministic seed for the random number generator.
    """random. random ( )
    Return the next random floating point number in the range [0.0, 1.0).
    
    """   #?????????
    random.Random(0).shuffle(reviews)  #Random(0)

    # First 60% of data is for training.
    start_dev = int(0.6 * len(reviews))  # index where the dev split starts
    # Next 20% is for development (hyper-parameter tuning).
    start_test = int(0.8 * len(reviews))
    # ... last 20% are for testing.

    training_collection = DocumentCollection.from_document_list(
        reviews[:start_dev])
    dev_collection = DocumentCollection.from_document_list(
        reviews[start_dev:start_test])
    test_collection = DocumentCollection.from_document_list(
        reviews[start_test:])
    return training_collection, dev_collection, test_collection
Example #4
def nltk_data(n_texts_train=1500, n_texts_dev=500, vocab_size=10000):
    """
    Reads texts from the nltk movie_reviews corpus. A word2id dictionary is 
    created and the words in the texts are substituted with their numbers. Training
    and Development data is returned, together with labels and the word2id dictionary.
 
    :param n_texts_train: the number of reviews that will form the training data
    :param n_texts_dev: the number of reviews that will form the development data
    :param vocab_size: the maximum size of the vocabulary.

    :return list texts_train: A list containing lists of wordids corresponding to 
    training texts.
    :return list texts_dev: A list containing lists of wordids corresponding to 
    development texts.
    :return labels_train: A list containing the labels (0 or 1) for the corresponding
    text entry in texts_train
    :return labels_dev: A list containing the labels (0 or 1) for the corresponding
    text entry in texts_dev
    :return word2id: The dictionary obtained from the training texts that maps each
    seen word to an id.
    """
    all_ids = movie_reviews.fileids()
    if (n_texts_train + n_texts_dev > len(all_ids)):
        print(
            "Error: There are only", len(all_ids),
            "texts in the movie_reviews corpus. Training with all of those sentences."
        )
        n_texts_train = 1500
        n_texts_dev = 500
    posids = movie_reviews.fileids('pos')
    random.shuffle(all_ids)

    texts_train = []
    labels_train = []
    texts_dev = []
    labels_dev = []

    for i in range(n_texts_train):
        text = movie_reviews.raw(fileids=[all_ids[i]])
        tokens = [word.lower() for word in word_tokenize(text)]
        texts_train.append(tokens)
        if all_ids[i] in posids:
            labels_train.append(1)
        else:
            labels_train.append(0)

    for i in range(n_texts_train, n_texts_train + n_texts_dev):
        text = movie_reviews.raw(fileids=[all_ids[i]])
        tokens = [word.lower() for word in word_tokenize(text)]
        texts_dev.append(tokens)
        if all_ids[i] in posids:
            labels_dev.append(1)
        else:
            labels_dev.append(0)

    word2id = create_dictionary(texts_train, vocab_size)
    texts_train = [to_ids(s, word2id) for s in texts_train]
    texts_dev = [to_ids(s, word2id) for s in texts_dev]
    return (texts_train, labels_train, texts_dev, labels_dev, word2id)
Example #5
def to_arr():
    neg_indexes = movie_reviews.fileids('neg')
    pos_indexes = movie_reviews.fileids('pos')
    neg_reviews = [movie_reviews.raw(fileids=ids) for ids in neg_indexes]
    pos_reviews = [movie_reviews.raw(fileids=ids) for ids in pos_indexes]
    list_of_text = stemmer(neg_reviews + pos_reviews)
    cv = CountVectorizer()
    word_arr = cv.fit_transform(list_of_text).toarray()
    bin_arr = np.array([0] * len(neg_reviews) + [1] * len(pos_indexes))
    return (word_arr, bin_arr)
Example #6
def f_i_and_c():
    neg_file_ids = movie_reviews.fileids('neg')
    pos_file_ids = movie_reviews.fileids('pos')
    neg_reviews = [movie_reviews.raw(fileids=ids) for ids in neg_file_ids]
    pos_reviews = [movie_reviews.raw(fileids=ids) for ids in pos_file_ids]
    list_of_text = stem_list_of_text(neg_reviews + pos_reviews)
    cv = CountVectorizer()
    f_i = cv.fit_transform(list_of_text).toarray()
    c = np.array([0] * len(neg_reviews) + [1] * len(pos_file_ids))
    return (f_i, c)
Example #7
    def opinion_features(fileid):
        """ starter feature engineering for movie reviews... """
        # many features are counts!
        positive_count = 0
        negative_count = 0
        exclaim_count = 0
        for word in movie_reviews.words(fileid):
            if word in pos_set:
                positive_count += 1
            if word in neg_set:
                negative_count += 1
        #'''
        for word in movie_reviews.words(fileid):
            for x in range(len(word)):
                if word[x] == '!':
                    exclaim_count += 1
        asum = 0
        aword = 0
        for word in movie_reviews.words(fileid):
            asum += len(word)
            aword += 1
        avgwordlen = asum / aword
        subjectivity = 0
        rawtext = movie_reviews.raw(fileid)
        blob = textblob.TextBlob(rawtext)
        for x in blob.sentences:
            subjectivity += x.sentiment.subjectivity
        #'''

        # Note:  movie_reviews.raw(fileid) is the whole review!
        # these next two lines will create
        # a TextBlob with all of the text from the review:
        rawtext = movie_reviews.raw(fileid)
        TB = textblob.TextBlob(rawtext)
        # from here, you can use TB.words and TB.sentences...

        # here is the dictionary of features...
        features = {}  # we could also use a default dictionary!

        features['positive'] = positive_count
        features['negative'] = negative_count
        #'''
        features['exclamation mark'] = exclaim_count
        features['average word length'] = avgwordlen
        features['number of words'] = aword
        features['subjectivity'] = subjectivity
        #'''
        return features
def load_movie_documents():
    """
    Returns a list where each entry is a pair (cat, [token, ...])
    """
    pairs = []
    for fileid in nltk.corpus.movie_reviews.fileids():
        # get the raw text
        text = movie_reviews.raw(fileid)
        category = "pos" if fileid.startswith("pos/") else "neg"

        document = []
        # We can just split the string on spaces, since it's already been
        # preprocessed.
        document = text.split()

        # The on-disk data has already been sentence split and tokenized!
        # So we don't need to do any of this! But if we got raw normal text, we
        # might want to do something like the following.
        ## # get all the sentences
        ## for sentence in nltk.sent_tokenize(text):
        ##     tokens = nltk.word_tokenize(sentence)
        ##     # now we have a list of tokens in "tokens"
        ##     document.extend(tokens)

        pairs.append((category, document))
    return pairs
Example #9
 def test_classify_movies(self):
     """Classify movie reviews from nltk corpus"""
     from nltk.corpus import movie_reviews
     docs = [Document(raw_text=movie_reviews.raw(fileid), class_label=category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
     print docs
Example #10
 def get_movie_reviews(cls):
     documents = [(category, movie_reviews.raw(fileid))
                  for category in movie_reviews.categories()
                  for fileid in movie_reviews.fileids(category)]
     random.shuffle(documents)
     l, d = zip(*documents)
     return l, d
def read_reviews():
    documents = []

    for file_id in movie_reviews.fileids():
        documents.append(movie_reviews.raw(file_id))

    return documents
Example #12
def main():
    vectorizer = TfidfVectorizer(
        use_idf=True,
        # norm=None,
        # smooth_idf=False,
        # sublinear_tf=False,
        # binary=False,
        # min_df=1,
        # max_df=1.0, max_features=None,
        # strip_accents='unicode',
        # ngram_range=(1,1), preprocessor=None,
        stop_words='english',
        tokenizer=None,
        vocabulary=None)
    # Lots of options to play around with; a few useful ones I found were norm, min_df and max_df
    # (see the small variant sketched after this function).

    for type in movie_reviews.categories():
        # only 2 categories : 'pos' and 'neg'
        type_ids = movie_reviews.fileids(type)
        X = vectorizer.fit_transform(
            list(movie_reviews.raw(t) for t in type_ids))
        idf = vectorizer.idf_

        # once we get weights, we just arrange it in decreasing sort
        wts = dict(zip(vectorizer.get_feature_names(), idf))
        s_wts = [(k, wts[k]) for k in sorted(wts, key=wts.get, reverse=True)]
        for key, val in s_wts[:10]:
            print(type, key, val)
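# A small illustrative variant of the vectorizer above (an assumption, not part of the
# original example), making the options the inline comment calls out explicit:
# drop very rare and very common terms and skip length normalisation.
alt_vectorizer = TfidfVectorizer(stop_words='english', norm=None, min_df=5, max_df=0.8)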
Example #13
    def prep_reviews_data(self): # messy code to test classifier with movie reviews
        if not self.movie_review_data:
            print 'Preparing movie reviews...\n'
            from nltk.corpus import movie_reviews
            docs = [movie_reviews.raw(fileid) 
                    for category in movie_reviews.categories() 
                    for fileid in movie_reviews.fileids(category)]

            process = lambda x: 1 if x == 'pos' else -1
            labels = [process(category)
                    for category in movie_reviews.categories() 
                    for fileid in movie_reviews.fileids(category)]

            docs, labels = double_shuffle(docs, labels)
            training, testing = divide_list_by_ratio(docs)
            self.train_labs, self.test_labs = divide_list_by_ratio(labels)

            train_vecs = self.vectorizer.fit_transform(training)
            test_vecs = self.vectorizer.transform(testing)

            if isinstance(self.model, naive_bayes.GaussianNB):
                train_vecs = train_vecs.toarray()
                test_vecs = test_vecs.toarray()

            self.train_vecs = train_vecs
            self.test_vecs = test_vecs

            self.movie_review_data = True
            self.news_market_data = False
Example #14
def get_documents():
    """
    Get documents from 20 News Groups, Movie Reviews and Reuters corpora.
    
    Returns:
        list of str: Small subset of documents from News Groups, Movie Reviews 
            and Reuters corpora
    """
    dataset = fetch_20newsgroups(subset='all',
                                 shuffle=True,
                                 remove=('headers', 'footers', 'quotes'))
    corpus_20newsgroups = dataset.data[:5]

    tuples = [(movie_reviews.raw(fileid), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]

    corpus_movies = [tuple_[0] for tuple_ in tuples]
    shuffle(corpus_movies)
    corpus_movies = corpus_movies[:5]

    tuples = [(reuters.raw(fileid), reuters.categories(fileid))
              for fileid in reuters.fileids()]
    corpus_reuters = [tuple_[0] for tuple_ in tuples]
    shuffle(corpus_reuters)
    corpus_reuters = corpus_reuters[:5]

    corpus = list()
    corpus.extend(corpus_20newsgroups)
    corpus.extend(corpus_movies)
    corpus.extend(corpus_reuters)

    return corpus
def read_reviews():
    documents = []
 
    for category in movie_reviews.categories():
        for fileid in movie_reviews.fileids(category):
            documents.append((movie_reviews.words(fileid), category, movie_reviews.raw(fileid)))

    return documents
Example #16
 def __init__(self, pos=None, neg=None):
     if not pos:
         # self.__pos = [open(f).read() for f in glob('review_polarity/txt_sentoken/pos/*.txt')]
         self.__pos = [
             movie_reviews.raw(file)
             for file in movie_reviews.fileids('pos')
         ]
     else:
         self.__pos = pos
     if not neg:
         # self.__neg = [open(f).read() for f in glob('review_polarity/txt_sentoken/neg/*.txt')]
         self.__neg = [
             movie_reviews.raw(file)
             for file in movie_reviews.fileids('neg')
         ]
     else:
         self.__neg = neg
def get_labeled_dataset():
    dataset = []
    for label in movie_reviews.categories():
        for review in movie_reviews.fileids(label):
            dataset.append((movie_reviews.raw(review), label))

    random.shuffle(dataset)
    return dataset
Example #18
    def opinion_features(fileid):
        """ starter feature engineering for movie reviews... """

        rawtext = movie_reviews.raw(fileid)
        TB = textblob.TextBlob(rawtext)

        total_words = len(TB.words)
        total_sentence = len(TB.sentences)
        positive_count = 0
        negative_count = 0
        for i in range(len(TB.words)):
            if TB.words[i] in pos_set:
                if TB.words[i - 1] in [
                        'not', 'less', 'few', "isn't", "hasn't", "wasn't"
                ] or TB.words[i - 2] == 'not':
                    negative_count += 1
                else:
                    positive_count += 1

            elif TB.words[i] in neg_set:
                if TB.words[i - 1] in [
                        'not', 'less', 'few', "isn't", "hasn't", "wasn't"
                ] or TB.words[i - 2] == 'not':
                    positive_count += 1
                else:
                    negative_count += 1

        # Note:  movie_reviews.raw(fileid) is the whole review!
        # create a TextBlob with
        rawtext = movie_reviews.raw(fileid)
        TB = textblob.TextBlob(rawtext)
        # now, you can use TB.words and TB.sentences...

        # here is the dictionary of features...
        features = {}  # could also use a default dictionary!

        features['positive'] = positive_count
        features['negative'] = negative_count
        features['total words'] = total_words
        features['total sentence'] = total_sentence
        features['sentimence'] = TB.sentiment.subjectivity
        features['polarity'] = TB.sentiment.polarity
        #features['negative_r'] = negative_count//total_words
        #features['positive_r'] = positive_count//total_words
        return features
Example #19
def read_reviews():
    documents = []
    labels = []

    for file_id in movie_reviews.fileids():
        documents.append(movie_reviews.raw(file_id))
        labels.append(movie_reviews.categories(file_id)[0])

    return documents, labels
Example #21
def loadData():
    print('loading the dataset')

    dataset = [(list(word_tokenize(movie_reviews.raw(fileid))), category)
               for category in movie_reviews.categories()
               for fileid in movie_reviews.fileids(category)]

    print('loading is  completed')
    return dataset
Example #22
def get_raw_text(corpus, file_name):
    string = ''
    if corpus == 'mr':
        from nltk.corpus import movie_reviews
        string = movie_reviews.raw(fileids=file_name)
    else:
        from nltk.corpus import reuters
        string = reuters.raw(fileids=file_name)
    return string
def get_list_tokens_nltk(corpus, file_name):
    string=''
    if corpus=='mr':
        from nltk.corpus import movie_reviews
        string = movie_reviews.raw(fileids=file_name)
    else:
        from nltk.corpus import reuters
        string = reuters.raw(fileids=file_name)
    list_words = re.split(r'\W+',string)
    return [w.lower() for w in list_words if w.isalpha() and len(w)>1 and w.lower() not in stopwords_english]
Example #24
def loadReview(f, sw):
    words = {}
    review = movie_reviews.raw(f)   #load the movie review
    for w in review.split():     #for each word in the review
        if w in sw:              #If the word is one of our stop words
            if w in words:
                words[w] = words[w] + 1
            else:
                words[w] = 1
    return words
Example #26
def create_bunch(col):
    bunch_data = []
    bunch_target = []
    for f in col:
        bunch_data.append(movie_reviews.raw(f))
        if (movie_reviews.categories(f)[0] == 'pos'):
            cat = 1
        else:
            cat = 0
        bunch_target.append(cat)
    return sklearn.utils.Bunch(data=bunch_data, target=bunch_target)
Example #27
    def get_model(self):
        filename = 'dump.pkl'
        if os.path.isfile(filename):
            with open(filename, 'rb') as f:
                model = pickle.load(f)
        else:
            X = [reviews.raw(fileid) for fileid in reviews.fileids()]
            y = [reviews.categories(fileid)[0] for fileid in reviews.fileids()]
            model = build_and_evaluate(X, y, outpath=filename)

        return model
Example #28
def load_moview_reviews(shuffle=True):
    X = [movie_reviews.raw(fileid) for fileid in movie_reviews.fileids()]
    y = [
        movie_reviews.categories(fileid)[0]
        for fileid in movie_reviews.fileids()
    ]
    Xy = list(zip(X, y))
    if shuffle:
        random.shuffle(Xy)
    else:
        random.shuffle(Xy, lambda: 0.42)
    return [x[0] for x in Xy], [x[1] for x in Xy]
Example #29
def main():
    """
    Sample training using the movie reviews corpus (Pang, Lee).
    """

    #== load inputs
    documents = np.array([
        movie_reviews.raw(review_id)
        for category in movie_reviews.categories()
        for review_id in movie_reviews.fileids(category)
    ])

    sentiment_scores = np.array([
        0 if category == 'neg' else 1
        for category in movie_reviews.categories()
        for review_id in movie_reviews.fileids(category)
    ])

    #== select random indices
    n = documents.shape[0]
    indices = np.random.permutation(n)
    threshold = int(n * 0.8)  # 80% training set / 20% test set
    train_idx, test_idx = indices[:threshold], indices[threshold:]

    #== select training and validation sets according to these indices
    x_train, x_test = documents[train_idx], documents[test_idx]
    y_train, y_test = sentiment_scores[train_idx], sentiment_scores[test_idx]

    #== train the model
    print '===== Training the model...'
    sentiment = SentimentMachine(x_train.tolist(), y_train.tolist())
    w = sentiment.train(speed=0.001, stochastic=False)
    print '===== Model trained.'

    #== test efficiency of the model
    print '===== Testing the model...'
    # compute the MSE
    h = lambda a, b: sigmoid(np.dot(a, b))
    x = sentiment.compute_features_matrix(x_test.tolist())
    mse = cost(w, x, y_test, h)
    # compute the number of valid classifications
    n_test = y_test.shape[0]
    valid = 0
    for i in xrange(n_test):
        valid += 1 if sentiment.classify(x_test[i]) == y_test[i] else 0
    percent = 100.0 * valid / n_test
    # print results
    print('== Number of well-classified documents: {0} / {1} ({2}%)'.format(
        valid, n_test, percent))
    print '== Cost value on the test set: %.4f' % mse
Example #31
 def add_document_to_corpus(self, filename=None):
     if filename:
         inp_file = open(filename, 'r')
     else:
         inp_file = movie_reviews.raw()
     words = self.get_tokens(inp_file)
     words_freq = Counter(words)
     self.num_words += len(words_freq)
     for word in words_freq:
         if word in self.term_freq:
             self.term_freq[word] += words_freq[word]
         else:
             self.term_freq[word] = words_freq[word]
Example #32
 def movieReviews(self, category, count):
   ret = []
   if category != 'positive' and category != 'negative':
     return ret
   fileids = []
   if category == 'positive':
     fileids = movie_reviews.fileids('pos')
   elif category == 'negative':
     fileids = movie_reviews.fileids('neg')
   sampleFileIds = sample(fileids, count)
   for sampleFileId in sampleFileIds:
     ret.append(movie_reviews.raw(sampleFileId))
   return ret
    def data_preprocessed(self):

        syms = '[]()"'
        data = movie_reviews.raw().lower()
        for sym in list(syms) + ['...']:
            data = data.replace(sym, '')

        corpus = data[:self.data_size]
        self.vocab = set(self.tokens_filter(corpus) + [BEGIN, END])
        self.vocab_len = len(self.vocab) + 1
        self.token2id = {t: i for i, t in enumerate(self.vocab, start=1)}
        self.id2token = {i: t for i, t in enumerate(self.vocab, start=1)}

        return corpus
def load_data_and_labels():
    # Load data from files

    # positive_examples = list(open("./data/rt-polaritydata/rt-polarity.pos", "r").readlines())
    # positive_examples = [s.strip() for s in positive_examples]
    positive_examples = movie_reviews.raw(fileid for fileid in movie_reviews.fileids('pos'))
    positive_examples = sent_tokenize(positive_examples)


    # negative_examples = list(open("./data/rt-polaritydata/rt-polarity.neg", "r").readlines())
    # negative_examples = [s.strip() for s in negative_examples]
    negative_examples = movie_reviews.raw(fileid for fileid in movie_reviews.fileids('neg'))
    negative_examples = sent_tokenize(negative_examples)

    # Split by words
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]
    x_text = [s.split(" ") for s in x_text]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]
def load_movie_documents():
    """
    Returns a list where each entry is a pair (cat, [token, ...])
    """
    pairs = []
    for fileid in nltk.corpus.movie_reviews.fileids():
        # get the raw text
        text = movie_reviews.raw(fileid)
        category = "pos" if fileid.startswith("pos/") else "neg"

        document = []
        # We can just split the string on spaces, since it's already been
        # preprocessed.
        document = text.split()
        pairs.append((category, document))
    return pairs
__author__ = 'a_medelyan'

# Goal: Get movie reviews and read them
# See: http://www.nltk.org/book/ch02.html

from nltk.corpus import movie_reviews

# How many documents in this corpus?
print len(movie_reviews.fileids())

# What are the categories?
print movie_reviews.categories()

# What are some files names?
print movie_reviews.fileids('neg')[:10]
print movie_reviews.fileids('pos')[:10]

# Print the words in a sample text
print movie_reviews.words('pos/cv000_29590.txt')

# Print the original text
print movie_reviews.raw('pos/cv000_29590.txt')

# Print the sentences of the text
print movie_reviews.sents('pos/cv000_29590.txt')

# Spare time? Calculate the average number of words and sentences in positive and negative reviews
# Do people use a lot more words when giving positive vs. negative reviews?
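# A minimal sketch for the exercise above (not part of the original tutorial):
# average number of words and sentences per review, by category.
for category in movie_reviews.categories():
    fileids = movie_reviews.fileids(category)
    avg_words = sum(len(movie_reviews.words(f)) for f in fileids) / float(len(fileids))
    avg_sents = sum(len(movie_reviews.sents(f)) for f in fileids) / float(len(fileids))
    print category, avg_words, avg_sents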
import collections
import nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB


stopset = stopwords.words('english')

negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

negdocs = [movie_reviews.raw(f) for f in negids]
posdocs = [movie_reviews.raw(f) for f in posids]
negtags=[0]*len(negdocs)
postags=[1]*len(posdocs)
                              
negcutoff = int(len(negdocs)*0.8)
poscutoff = int(len(posdocs)*0.8)

traindocs = negdocs[:negcutoff] + posdocs[:poscutoff]
traintags = negtags[:negcutoff] + postags[:poscutoff]
testdocs = negdocs[negcutoff:] + posdocs[poscutoff:]
testtags = negtags[negcutoff:] + postags[poscutoff:]
print 'train on %d instances, test on %d instances' % (len(traindocs), len(testdocs))


vectorizer = CountVectorizer(min_df=1, binary=True, stop_words=stopset)
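# The snippet stops at the vectorizer; a sketch of how the pieces above are
# typically wired together (an assumption, not part of the original file):
train_X = vectorizer.fit_transform(traindocs)
test_X = vectorizer.transform(testdocs)
clf = MultinomialNB().fit(train_X, traintags)
print 'accuracy:', np.mean(clf.predict(test_X) == np.array(testtags))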
Example #40
    # score these bigrams
    best_bigrams = bigram_finder.nbest(score_fn, n)

    # return boolean mapping, as before
    return dict([(ngram, True) for ngram in itertools.chain(words, best_bigrams)])

if __name__ == '__main__':
 

    # load corpus, will be different for corpora not included in nltk
    neg_ids = movie_reviews.fileids('neg')
    pos_ids = movie_reviews.fileids('pos')
     
    # bag-of-words features
    neg_feats = [(word_feats(movie_reviews.raw(fileids=[f])), 'neg') for f in neg_ids]
    pos_feats = [(word_feats(movie_reviews.raw(fileids=[f])), 'pos') for f in pos_ids]

    # uncomment for bag-of-bigram feats (make sure to comment out the preceding two lines)
    # neg_feats = [(bigram_feats(movie_reviews.raw(fileids=[f])), 'neg') for f in neg_ids]
    # pos_feats = [(bigram_feats(movie_reviews.raw(fileids=[f])), 'pos') for f in pos_ids]
     
    # create train / test split
    negcutoff = len(neg_feats)*3/4
    poscutoff = len(pos_feats)*3/4

    train_feats = neg_feats[:negcutoff] + pos_feats[:poscutoff]
    test_feats = neg_feats[negcutoff:] + pos_feats[poscutoff:]
    print 'train on %d instances, test on %d instances' % (len(train_feats), len(test_feats))
     
    # classify
Example #41
#!/usr/bin/env python

import re
from nltk.corpus import movie_reviews

documents = []
n = 10
for category in movie_reviews.categories():
	for fileid in movie_reviews.fileids(category):
		n -= 1
		if n <= 0: break
		documents.append(movie_reviews.raw(fileid))
		# print documents[-1]

for doc in documents:
	patterns = re.findall("^(t.*\')", doc)
	if len(patterns) != 0:
		print patterns[0]
print blob.sentiment

blob = TextBlob("I love this library")
print blob.sentiment

# Test on new movie reviews
transcendence = ['../data/transcendence_1star.txt', '../data/transcendence_5star.txt', '../data/transcendence_8star.txt',
                 '../data/transcendence_great.txt']

# Insert code here
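# One possible fill for the gap above -- a sketch only, assuming the
# transcendence_*.txt files listed above exist on disk:
for path in transcendence:
    text = open(path).read()
    print path, TextBlob(text).sentiment.polarity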


# Spare time? Evaluate both ways of determining sentiment.
# Also test out various polarity thresholds.

correct = 0
for fileid in movie_reviews.fileids():
    raw = movie_reviews.raw(fileid)
    blob = TextBlob(raw)
    sentiment = blob.sentiment

    guessed = 'neg'
    if sentiment.polarity > 0.11:
        guessed = 'pos'

    actual = movie_reviews.categories(fileid)[0]
    if guessed == actual:
        correct += 1

accuracy = float(correct)/len(movie_reviews.fileids())
print accuracy
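# A sketch for the "various polarity thresholds" suggestion above: repeat the
# evaluation with several cutoffs and report the accuracy of each.
for threshold in [0.0, 0.05, 0.1, 0.11, 0.15, 0.2]:
    correct = 0
    for fileid in movie_reviews.fileids():
        polarity = TextBlob(movie_reviews.raw(fileid)).sentiment.polarity
        guessed = 'pos' if polarity > threshold else 'neg'
        if guessed == movie_reviews.categories(fileid)[0]:
            correct += 1
    print threshold, float(correct) / len(movie_reviews.fileids())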
Example #43
vect.fit_transform(sample).toarray()
vect.get_feature_names()

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
tfidf.fit_transform(sample).toarray()
tfidf.get_feature_names()


'''
EXAMPLE: Automatically summarize a document
'''

# corpus of 2000 movie reviews
from nltk.corpus import movie_reviews
reviews = [movie_reviews.raw(filename) for filename in movie_reviews.fileids()]

# create document-term matrix
tfidf = TfidfVectorizer(stop_words='english')
dtm = tfidf.fit_transform(reviews)
features = tfidf.get_feature_names()

import numpy as np

# find the most and least "interesting" sentences in a randomly selected review
def summarize():
    
    # choose a random movie review    
    review_id = np.random.randint(0, len(reviews))
    review_text = reviews[review_id]
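    # The original example is truncated here. A sketch of one way it might continue
    # (an assumption, not the author's code): score each sentence by the mean tf-idf
    # weight of its known terms and show the most and least "interesting" ones.
    import nltk
    sentences = nltk.sent_tokenize(review_text)
    scores = []
    for sentence in sentences:
        cols = [tfidf.vocabulary_[t] for t in nltk.word_tokenize(sentence.lower()) if t in tfidf.vocabulary_]
        scores.append(np.mean([dtm[review_id, c] for c in cols]) if cols else 0.0)
    print(sentences[int(np.argmax(scores))])  # most "interesting" sentence
    print(sentences[int(np.argmin(scores))])  # least "interesting" sentence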
Example #44
def filter_text(raw_text):  # signature inferred from the call below; the top of this helper is cut off
  raw_text=re.sub('#', ' ', raw_text)
  raw_text=re.sub('^https?:\/\/.*[\r\n]*', ' ', raw_text)
  return raw_text

document_set=[]
st = LancasterStemmer()

from nltk.corpus import movie_reviews
for category in movie_reviews.categories():
  if category=='pos':
      cat=True
  else:
      cat=False
      
  for fileid in movie_reviews.fileids(category):
    raw_text = movie_reviews.raw(fileid)
    raw_text = filter_text(raw_text)
    tokens = nltk.word_tokenize(raw_text)
    #words=[st.stem(i) for i in list(tokens)]
    document_set.append((list(tokens), cat))

random.shuffle(document_set)


test_set=document_set[0:600]

whole_set=document_set

document_set=whole_set[0:1400]

all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
Example #45
def evaluate_classifier(featx, dataset, encod=""):
    if dataset=="movies":
      negids = movie_reviews.fileids('neg')
      posids = movie_reviews.fileids('pos')
      
      #print movie_reviews.raw(fileids=[negids[0]])
      negtexts = [preprocess(movie_reviews.raw(fileids=[f]),'text') for f in negids] 
      posfexts = [preprocess(movie_reviews.raw(fileids=[f]),'text') for f in posids]

      negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
      posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
    
      negfeats2 = [(preprocess(movie_reviews.raw(fileids=[f]),'dict'), 'neg') for f in negids]
      posfeats2 = [(preprocess(movie_reviews.raw(fileids=[f]),'dict'), 'pos') for f in posids]

      Nneg = len(negfeats)
      Npos = len(posfeats)
      negcutoff = Nneg*3/4
      poscutoff = Npos*3/4
    
      trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
      testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

      trainfeats2 = negfeats2[:negcutoff] + posfeats2[:poscutoff]
      testfeats2 = negfeats2[negcutoff:] + posfeats2[poscutoff:]

      train_data = negtexts[:negcutoff] + posfexts[:poscutoff]
      train_targets = np.append(np.full_like(np.arange(negcutoff, dtype=np.int),0) , np.full_like(np.arange(poscutoff, dtype=np.int),1))
      test_data = negtexts[negcutoff:] + posfexts[poscutoff:]
      test_targets = np.append(np.full_like(np.arange(Nneg-negcutoff, dtype=np.int),0) , np.full_like(np.arange(Npos-poscutoff, dtype=np.int),1))

    elif dataset=="20newsgroups-5":   

      categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
      twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
      twenty_test = fetch_20newsgroups(subset='test',categories=categories, shuffle=True, random_state=42)      

      train_data = twenty_train.data
      train_targets = twenty_train.target
      test_data = twenty_test.data
      test_targets = twenty_test.target

      trainfeats = [(featx(preprocess(train_data[i],'words')), train_targets[i]) for i in range(len(train_data))]
      trainfeats2 = [(preprocess(train_data[i],'dict'), train_targets[i]) for i in range(len(train_data))]
      testfeats = [(featx(preprocess(test_data[i],'words')), test_targets[i]) for i in range(len(test_data))]
      testfeats2 = [(preprocess(test_data[i],'dict'), test_targets[i]) for i in range(len(test_data))]

    elif dataset=="20newsgroups":   

      twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
      twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)      

      train_data = twenty_train.data
      train_targets = twenty_train.target
      test_data = twenty_test.data
      test_targets = twenty_test.target

      trainfeats = [(featx(preprocess(train_data[i],'words')), train_targets[i]) for i in range(len(train_data))]
      trainfeats2 = [(preprocess(train_data[i],'dict'), train_targets[i]) for i in range(len(train_data))]
      testfeats = [(featx(preprocess(test_data[i],'words')), test_targets[i]) for i in range(len(test_data))]
      testfeats2 = [(preprocess(test_data[i],'dict'), test_targets[i]) for i in range(len(test_data))]

    else:
      # Open a file
      path = dataset
      cat_dirs = os.listdir( path )
      
      print "Reading corpus from "+path  
      # This would print all the files and directories
      ncat = 0

      train_data = []
      train_targets = []
      test_data = []
      test_targets = []
  
      
      for category in cat_dirs:
        print "Reading category: "+category

        cat_files = os.listdir( path+"/"+category )

        temp_data = []
        temp_targets = []
        #encod = 'utf-8'

        for filename in cat_files:
          with io.open(path+"/"+category+"/"+filename, 'r', encoding=encod) as file:
            content = preprocess(file.read(),"text")
            temp_data.append(content)
         
          temp_targets.append(ncat)

        cutoff = len(temp_data)*3/4
        train_data = train_data + temp_data[:cutoff]
        train_targets = train_targets + temp_targets[:cutoff]
        test_data = test_data + temp_data[cutoff:]
        test_targets = test_targets + temp_targets[cutoff:]

        ncat+=1

      print "Finish reading corpus. "

      trainfeats = [(featx(preprocess(train_data[i],'words')), train_targets[i]) for i in range(len(train_data))]
      trainfeats2 = [(preprocess(train_data[i],'dict'), train_targets[i]) for i in range(len(train_data))]
      testfeats = [(featx(preprocess(test_data[i],'words')), test_targets[i]) for i in range(len(test_data))]
      testfeats2 = [(preprocess(test_data[i],'dict'), test_targets[i]) for i in range(len(test_data))]


      #sys.exit()  

    # scikit NB classifier
    print "Scikit classifier: "
    count_vect = CountVectorizer()
    X_train = count_vect.fit_transform(train_data)
    clf = MultinomialNB().fit(X_train, train_targets)     

    X_new = count_vect.transform(test_data)
    predicted = clf.predict(X_new)
    print 'Raw counts accuracy: ',np.mean(predicted == test_targets) 

    tf_transformer = TfidfTransformer(use_idf=False).fit(X_train)
    X_train_tf = tf_transformer.transform(X_train)
    clf = MultinomialNB().fit(X_train_tf, train_targets) 

    X_new = tf_transformer.transform(count_vect.transform(test_data))
    predicted = clf.predict(X_new)
    print 'TF accuracy: ',np.mean(predicted == test_targets) 

    tf_transformer = TfidfTransformer().fit(X_train)
    X_train_tf = tf_transformer.transform(X_train)
    clf = MultinomialNB().fit(X_train_tf, train_targets) 
    print clf.feature_log_prob_

    X_new = tf_transformer.transform(count_vect.transform(test_data))
    predicted = clf.predict(X_new)
    print 'Tfidf accuracy: ',np.mean(predicted == test_targets) 



    # NLTK classifier
    print "NLTK classifier: "
    classifier = NaiveBayesClassifier.train(trainfeats2)
    print 'Raw words accuracy:', nltk.classify.util.accuracy(classifier, testfeats2)

    classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
 
    for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)
 
    print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
    #print "--> "+str(classifier)+"\n"
    #print str(testfeats)
    print 'pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos'])
    print 'pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos'])
    print 'neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg'])
    print 'neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg'])
    classifier.show_most_informative_features()
Example #46

import nltk
from nltk.corpus import movie_reviews
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
import numpy as np
import random

# Get a list of (document text, category)
documents = [
    (movie_reviews.raw(fileid), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]
random.seed(3)
random.shuffle(documents)

reviewtext, rating = documents[0]
print reviewtext
print rating

train_samples, test_samples = documents[:1000], documents[1000:]
Example #47
# http://www-rohan.sdsu.edu/~gawron/python_for_ss/course_core/book_draft/text/text_classification.html

# We import Bo Pang and Lillian Lee’s movie reviews corpus [PANGLEE2004], which is one of the NLTK corpora.
from nltk.corpus import movie_reviews as mr

# Use a Naive Bayes Classifier
from nltk.classify import NaiveBayesClassifier

data = dict(pos = mr.fileids('pos'),
            neg = mr.fileids('neg'))
            
print mr.readme()

# The character by character view uses the raw method:
print mr.raw(data['pos'][0])[:100]

# The word by word character view uses the words method:
print mr.words(data['pos'][0])[:10]

def unigram_features (words):
   return dict((word, True) for word in words)

def extract_features (corpus, file_ids, cls, feature_extractor=unigram_features):

   return [(feature_extractor(corpus.words(i)), cls) for i in file_ids]

#### Training

# Use 90% of the data for training
neg_training = extract_features(mr, data['neg'][:900], 'neg',
from nltk.corpus import gutenberg, abc, reuters, brown, movie_reviews
from topia.termextract import extract
extractor = extract.TermExtractor()

with open('./corpus/all3.txt', 'r') as f:
	with open('./data/terms.txt', 'w') as o:
		o.write("Term\tOccurences\tStrength\n")
		for term in extractor(f.read()+gutenberg.raw()+abc.raw()+reuters.raw()+brown.raw()+movie_reviews.raw()):
			o.write("\t".join(map(str, term)) + "\n")
Example #49
def getNEG():
  return [normalize(movie_reviews.raw(fileids=file)) for file in movie_reviews.fileids("neg")[200:1000]] +  [normalize(movie_reviews.raw(fileids=file)) for file in movie_reviews.fileids("neg")[800:1000]]
Example #50
def evaluate(smoothingMethod, POSfreqs, NEGfreqs, POStest, NEGtest, name="", terse=False, validate=False):  
  """
Evaluate the accuracy of trained n-gram models (given by frequency lists and a smoothing method)
in distinguishing between positive and negative movie reviews.

Arguments:
  - smoothingMethod: a function f(w, freqs) that takes two arguments, a string <w> of length <n> and
      a frequencyList <freqs> as explained below, and returns the smoothed conditional probability
      of the n-gram <w>, i.e. Pr(w_n | w_1 .. w_{n-1}) -- or Pr(w[n-1]|w[0:n-1]) in Python indexing
  - POSfreqs: a list of dictionaries containing n-gram frequency counts for positive movie reviews,
      which can be used by smoothingMethod to compute conditional n-gram probabilities; it is
      customary to list unigram frequency counts first, followed by bigrams, etc.; however, any other
      data structure can be passed provided that it is accepted as a second argument by smoothingMethod
  - NEGfreqs: same as above, for negative movie reviews
  - POStest: list of positive reviews (characters strings) used to evaluate the trained n-gram models
  - NEGtest: same as above, for negative reviews
  - name: name of the smoothing/interpolation method (for evaluation report)
  - terse: if True, show compact evaluation report in single line
  - validate: test whether smoothingMethod produces a valid probability distribution without zeroes
"""

  # for the contest evaluation, we force validation and terse reporting
  validate = True
  terse = True
  
  # we also use a new "secret" test set, ignoring the parameters POStest and NEGtest
  # (this should have been held-out data, but by a silly mistake all movie reviews were included in the data set; so we use the first 200 reviews in each set, making sure they aren't part of the training data)
  POStest = [normalize(movie_reviews.raw(fileids=file)) for file in movie_reviews.fileids("pos")[0:200]]
  NEGtest = [normalize(movie_reviews.raw(fileids=file)) for file in movie_reviews.fileids("neg")[0:200]]

  if len(POSfreqs) != len(NEGfreqs):
    raise Exception("Both lists of n-gram frequency counts (POSfreqs / NEGfreqs) must have the same length!")
  if len(POStest) != len(NEGtest):
    raise Exception("Test set must contain same number of positive and negative reviews for a fair evaluation!")
  n = len(POSfreqs) - 1 # size of n-gram model, with POSfreqs = [corpus size, {unigrams}, {bigrams}, ...]
  
  n_guesses = 0
  guess = [0, 0, 0]   # n-gram classifier: none / pos / neg
  gold = [0, 0, 0]    # gold standard: none / pos / neg
  correct = [0, 0, 0] # whether classifier is correct: correct / wrong / no decision
  zeroProb = 0
  inconsistent = 0
  posCount = len(POStest)
  negCount = len(NEGtest)
    
  for category, test_set in [(1, POStest), (2, NEGtest)]:
    for review in test_set:
      review = normalize(review) # make sure that strings are properly normalized
      ngram = [' '] * n # first n-gram only consists of stop-characters
      POSlogp = 0       # calculate probabilities on logarithmic scale to avoid underflow
      NEGlogp = 0
      POSzero = False  # flag is set if any of the conditional n-gram probabilities is zero (cannot be represented on log scale)
      NEGzero = False
      for character in review:
        ngram.pop(0)
        ngram.append(character)

        ngram_string = ''.join(ngram) # convert ngram in fifo to string
        history = ngram_string[:-1]
        next_char = ngram_string[-1:]

        # call smoothingMethod to calculate conditional n-gram probability
        POSngramp = smoothingMethod(ngram_string, POSfreqs)
        NEGngramp = smoothingMethod(ngram_string, NEGfreqs)

        if POSngramp <= 0.0:
          POSzero = True
          if validate:
            print "Error: Pr(%s|%s) = 0 (positive model)" % (next_char, history)
        else:
          POSlogp += log(POSngramp)

        if NEGngramp <= 0.0:
          NEGzero = True
          if validate:
            print "Error: Pr(%s|%s) = 0 (negative model)" % (next_char, history)
        else:
          NEGlogp += log(NEGngramp)

        if validate:
          POScumprob = 0.0
          NEGcumprob = 0.0
          for c in alphabet:
            POScumprob += smoothingMethod(history + c, POSfreqs)
            NEGcumprob += smoothingMethod(history + c, NEGfreqs)
          if (abs(POScumprob - 1.0) > 1e-6):
            print "Error: Sum Pr(*|%s) = %f does not sum to 1 (positive model)" % (history, POScumprob)
            inconsistent += 1
          if (abs(NEGcumprob - 1.0) > 1e-6):
            print "Error: Sum Pr(*|%s) = %f does not sum to 1 (positive model)" % (history, NEGcumprob)
            inconsistent += 1
      
      # zero probability flagged: set log(Pr(w)) = -Inf for classifier, issue warning later
      if POSzero:
        zeroProb += 1
        POSlogp = -9e99  # practically -Inf
      if NEGzero:
        zeroProb += 1
        NEGlogp = -9e99

      # determine classifier decision and check whether it is correct
      if POSlogp > NEGlogp:
        classifier = 1 # decision: pos
      elif POSlogp < NEGlogp:
        classifier = 2 # decision: neg
      else:
        classifier = 0 # no decision (e.g. if both models have Pr(w) = 0)

      guess[classifier] += 1
      gold[category] += 1

      n_guesses += 1
      if classifier == 0:
        correct[2] += 1
      elif classifier == category:
        correct[0] += 1
      else:
        correct[1] += 1

  accuracy = float(correct[0]) / n_guesses * 100
  baseline = float(max(gold)) / n_guesses * 100
  if (gold[1] >= gold[2]):
    majority = "positive"
  else:
    majority = "negative"

  if terse:
    print 'accuracy:%6.2f%% (%3d/%3d/%3d) %4d positive,%4d negative  %s' % (accuracy, correct[0], correct[1], correct[2], guess[1], guess[2], name)
  else:
    print ' EVALUATION: %s' % (name)
    print '='*35
    print 
    print 'Correct        : %3d reviews' % correct[0]
    print 'Wrong          : %3d reviews' % correct[1]
    print 'Not classified : %3d reviews' % correct[2]
    print '-'*35
    print 'Accuracy       :%6.2f%%' % (accuracy)
    print 'Baseline       :%6.2f%%  (always classify as %s)' % (baseline, majority)
    print
    if zeroProb > 0:
      print 'Warning: probability = 0 estimated in %d cases -- need better smoothing!' % zeroProb
      print

  if validate and (zeroProb > 0 or inconsistent > 0):
    error_message = "N-Gram model (%s) failed validation: %d zero probabilities, %d inconsistencies" % (name, zeroProb, inconsistent)
    raise Exception(error_message)
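# For reference, a toy add-one (Laplace) smoothed model with the interface the
# docstring of evaluate() describes -- an illustrative sketch only, assuming
# freqs = [corpus size, {unigram counts}, {bigram counts}, ...] keyed by n-gram strings.
def addone_smoothing(w, freqs):
  n = len(freqs) - 1                                  # order of the n-gram model
  count = freqs[n].get(w, 0)
  if n == 1:
    history_count = freqs[0]                          # total corpus size
  else:
    history_count = freqs[n - 1].get(w[:-1], 0)
  return (count + 1.0) / (history_count + len(alphabet))  # alphabet is defined elsewhere in this module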
Example #51
secure_app_port = 8443

eureka_url = "http://registry:1111/eureka/"

np.set_printoptions(precision=8)

# Read the data
classes = ['pos', 'neg']
train_data = []
train_labels = []
test_data = []
test_labels = []
for curr_class in classes:
    ids = movie_reviews.fileids(curr_class)
    for f in ids:
        train_data.append(movie_reviews.raw(f))
        train_labels.append(curr_class)

# Create feature vectors
vectorizer = TfidfVectorizer(min_df=5,
                             max_df=0.8,
                             sublinear_tf=True,
                             use_idf=True)
train_vectors = vectorizer.fit_transform(train_data)

# Perform classification with SVM, kernel=rbf
classifier_rbf = svm.SVC()
t0 = time.time()
classifier_rbf.fit(train_vectors, train_labels)
t1 = time.time()
time_rbf_train = t1 - t0
    for reference, predicted, text in zip(
                                          reference_labels,
                                          predicted_labels,
                                          reference_text
                                          ):
        if reference != predicted:
            fh.write("{0} {1}\n{2}\n\n".format(reference, predicted, text))

    fh.close()
    
if __name__ == '__main__':
    # You have to download the movie reviews first
    #nltk.download("movie_reviews")
    reviews = [
         (movie_reviews.raw(fid), list(movie_reviews.words(fid)), category) 
         for category in movie_reviews.categories() 
         for fid in movie_reviews.fileids(category)
         ]
    #print(reviews)
    # [(),...(),    ("text", ["w1", "w2", .....], "pos")]
    
    # Make sure we split the same way every time for the live coding
    random.seed(0)
    
    # Make sure to randomize the reviews first!
    random.shuffle(reviews)
    
    # Convert the data into feature vectors
    featuresets = [
        (features(review_text, review_words), label)