def setup():
    global bestwords

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    # Apply the same normalization to both distributions so the keys match.
    for word in movie_reviews.words(categories=['pos']):
        word = word.strip('\'"?,.').lower()
        word_fd[word] += 1
        label_word_fd['pos'][word] += 1

    for word in movie_reviews.words(categories=['neg']):
        word = word.strip('\'"?,.').lower()
        word_fd[word] += 1
        label_word_fd['neg'][word] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}

    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
            (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
            (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:10000]
    bestwords = set(w for w, s in best)
    return train(best_bigram_word_features)
def load_data():
    global posfeats, negfeats
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
    return
def GetHighInformationWordsChi(num_bestwords):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for word in movie_reviews.words(categories=['pos']):
        word_fd[word.lower()] += 1
        label_word_fd['pos'][word.lower()] += 1

    for word in movie_reviews.words(categories=['neg']):
        word_fd[word.lower()] += 1
        label_word_fd['neg'][word.lower()] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}

    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
            (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
            (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:num_bestwords]
    bestwords = set(w for w, s in best)
    return bestwords
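A minimal sketch of how the returned bestwords set is typically plugged into a feature extractor (the helper name high_info_feats and the sample fileid usage are assumptions, consistent with the other examples on this page):

def high_info_feats(words, bestwords):
    # Keep only the high-information words as boolean features.
    return {word: True for word in words if word in bestwords}

bestwords = GetHighInformationWordsChi(10000)
feats = high_info_feats(movie_reviews.words('pos/cv000_29590.txt'), bestwords)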
Example #4
def maketrainset(movie_reviews, tokenizer, stemmer):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    negfeats = [(tokenizer(movie_reviews.words(fileids=[f]), stemmer), 'neg') for f in negids]
    posfeats = [(tokenizer(movie_reviews.words(fileids=[f]), stemmer), 'pos') for f in posids]
    trainfeats = negfeats + posfeats
    return trainfeats
Example #5
def main():
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    classifier = NaiveBayesClassifier.train(trainfeats)

    with open("output.json") as fin:
        sid = SentimentIntensityAnalyzer()
        data = json.load(fin)
    for key in data:
        reviews = data[key]["reviews"]
        for i in range(len(reviews)):
            text = reviews[i]["review"]
            sentiment_dict = {'positive_probability':0, 'label':'', 'negative_probability':0}
            prob = classifier.prob_classify(word_feats(text.split(" ")))
            classification = classifier.classify(word_feats(text.split(" ")))
            sentiment_dict['positive_probability'] = prob.prob('pos')
            sentiment_dict['negative_probability'] = prob.prob('neg')
            sentiment_dict['label'] = classification
            reviews[i]["sentiment"] = sentiment_dict
        data[key]["reviews"] = reviews
    with open('out_with_sentiment.json', 'w') as outfile:
        json.dump(data, outfile)
Example #6
  def __init__(self):
    ## Best words feature extraction
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for word in movie_reviews.words(categories=['pos']):
      word_fd[word.lower()] += 1
      label_word_fd['pos'][word.lower()] += 1

    for word in movie_reviews.words(categories=['neg']):
      word_fd[word.lower()] += 1
      label_word_fd['neg'][word.lower()] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}

    for word, freq in word_fd.items():
      pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
        (freq, pos_word_count), total_word_count)
      neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
        (freq, neg_word_count), total_word_count)
      word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:10000]
    self.bestwords = set(w for w, s in best)
    self.train_classifier()
def prepareSentimentClassifier():

	documents = [(list(movie_reviews.words(fileid)), category)
		for category in movie_reviews.categories()
		for fileid in movie_reviews.fileids(category)]

	random.shuffle(documents)

	all_words = []
	for w in movie_reviews.words():
	    all_words.append(w.lower())

	all_words = nltk.FreqDist(all_words)
	
	global word_featuresSent
	word_featuresSent = list(all_words.keys())[:3000]

	featuresets = [(findFeaturesSentiment(rev), category) for (rev, category) in documents]
	
	training_set = featuresets[:1900]
	testing_set = featuresets[1900:]

	sentimentClassifier = nltk.NaiveBayesClassifier.train(training_set)

	print("Classifier accuracy percent:",(nltk.classify.accuracy(sentimentClassifier, testing_set))*100)

	return sentimentClassifier
Example #8
def best_word_feats(self, words):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for word in movie_reviews.words(categories=['pos']):
        word_fd[word.lower()] += 1
        label_word_fd['pos'][word.lower()] += 1

    for word in movie_reviews.words(categories=['neg']):
        word_fd[word.lower()] += 1
        label_word_fd['neg'][word.lower()] += 1

    # n_ii = label_word_fd[label][word]
    # n_ix = word_fd[word]
    # n_xi = label_word_fd[label].N()
    # n_xx = label_word_fd.N()

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}

    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
            (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
            (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:10000]
    bestwords = set(w for w, s in best)
    return dict((word, True) for word in words if word in bestwords)
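For reference, the chi_sq contingency arguments documented in the comments above can be exercised directly. A small hedged example (the counts are made up):

from nltk.metrics import BigramAssocMeasures

# n_ii: count of the word under this label; (n_ix, n_xi): total count of the
# word and total words under the label; n_xx: total words overall.
score = BigramAssocMeasures.chi_sq(40, (50, 10000), 20000)
print(score)  # higher scores mean the word is more label-specific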
Example #9
    def train_with_movie_db(self):
        """
        Training possible with movie reviews
        - this does not yield particularly good results
        """
        self.use_movie_reviews = True

        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')

        negfeats = [(self.feature_extraction_movie_reviews(movie_reviews.words(fileids=[f])),
                     "negative") for f in negids]
        posfeats = [(self.feature_extraction_movie_reviews(movie_reviews.words(fileids=[f])),
                     "positive") for f in posids]

        negcutoff = len(negfeats) * 3 // 4
        poscutoff = len(posfeats) * 3 // 4

        trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
        testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

        DLOG("train on %d instances, test on %d instances" % (len(trainfeats), len(testfeats)))

        self.classifier = NaiveBayesClassifier.train(trainfeats)

        DLOG("accuracy: " + str(util.accuracy(self.classifier, testfeats)))
        DLOG(self.classifier.show_most_informative_features())
Example #10
def train(test=False):

    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')


    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]


    if test:
        negcutoff = len(negfeats) * 3 // 4
        poscutoff = len(posfeats) * 3 // 4

        trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
        testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

        print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

        classifier = NaiveBayesClassifier.train(trainfeats)
        print('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))

        classifier.show_most_informative_features()

    else:
        return NaiveBayesClassifier.train(negfeats+posfeats)
def documentClassification():

    from nltk.corpus import movie_reviews

    documents = [(list(movie_reviews.words(fileid)), category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]

    random.shuffle(documents)

    all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
    word_features = [w for w, _ in all_words.most_common(2000)]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    print(document_features(movie_reviews.words('pos/cv957_8737.txt')))

    featuresets = [(document_features(d), c) for (d,c) in documents]
    train_set, test_set = featuresets[100:], featuresets[:100]
    classifier = nltk.NaiveBayesClassifier.train(train_set)

    print(nltk.classify.accuracy(classifier, test_set))
    classifier.show_most_informative_features(5)
Example #12
def category_by_movie():
    from nltk.corpus import movie_reviews as mr
    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.corpus import names
    from nltk.classify import apply_features
    import random

    documents = [(list(mr.words(f)), c) for c in mr.categories()
                 for f in mr.fileids(c)]
    random.shuffle(documents)

    all_words = FreqDist(w.lower() for w in mr.words())
    word_features = [w for w, _ in all_words.most_common(2000)]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    #print document_features(mr.words('pos/cv957_8737.txt'))
    #print documents[0]

    features = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = features[100:], features[:100]
    classifier = NaiveBayesClassifier.train(train_set)
    print(classify.accuracy(classifier, test_set))
def evaluate_classifier(featx):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
 
    negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
 
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
 
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
 
    classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
 
    for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)
 
    print('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
    print('pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos']))
    print('pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos']))
    print('neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg']))
    print('neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg']))
    classifier.show_most_informative_features()
Example #14
def main(argv):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    #print negids
 
    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'negative') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'positive') for f in posids]

    trainfeats =  posfeats+negfeats
    #print trainfeats
    #    break
    classifier = NaiveBayesClassifier.train(trainfeats)

    #classifier = pickle.load(open("classifier.p", "rb"))
    topicList = ["media", "sports", "news", "fashion", "finance", "politics"]
    for line in sys.stdin:
        try:
            tolk_posset = word_tokenize(line.rstrip())
            d = word_feats(tolk_posset)
            for topic in topicList:
                subjectFull = subj(line, topic)
                if not subjectFull == "No match":
                    #print d
                    print "LongValueSum:" + "" + str(line.split(":")[0])+","+subjectFull + "," + classifier.classify(d) + "\t" + "1"                    
        except:
                #print "Error"
                continue
Example #15
    def __init__(self):
        self.documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
        random.shuffle(self.documents)

        all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
        word_features = [w for w, _ in all_words.most_common(2000)]
Example #16
def __init__(self, train1=True, train2=True, train3=True, train4=True):
    self.trainfeats = []

    if train1:
        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')

        neg_movies = [(self.word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
        pos_movies = [(self.word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

        self.trainfeats = neg_movies + pos_movies

    if train2:
        f = open("out.txt", "r")

        negfeats = []
        posfeats = []
        for line in f:
            status = line[0]
            texto = line[2:]

            if status == '0':
                negfeats.append((self.word_feats(texto.split(" ")), 'neg'))
            elif status == '1':
                posfeats.append((self.word_feats(texto.split(" ")), 'pos'))

        self.trainfeats += negfeats + posfeats

    if train3:
        f = open("E:\\Workspace\\WS_TG\\analisador1\\AFINN\\AFINN-111.txt", 'r')
        for l in f:
            data = l.strip().split('\t')
            # word_feats expects a token list, so split the AFINN term
            self.trainfeats.append((self.word_feats(data[0].split()), 'neg' if int(data[1]) < 0 else 'pos'))

    if train4:
        f = open("E:\\Workspace\\WS_TG\\api\\trainning set.txt", 'r')
        pos = []
        neutral = []
        neg = []
        for line in f:
            if line.startswith("pos"):
                pos.append(line)
            elif line.startswith("neutral"):
                neutral.append(line)
            elif line.startswith("neg"):
                neg.append(line)

        print(len(pos), len(neutral), len(neg))

        total = pos + neutral[:200] + neg

        for line in total:
            data = line.split(' .:. ')
            self.trainfeats.append((self.word_feats(data[1].split()), data[0]))

    self.classifier = NaiveBayesClassifier.train(self.trainfeats)

    print(self.classifier.show_most_informative_features(20))
def build_classifier(self):
    documents = [(' '.join(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    random.shuffle(documents)

    all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words() if len(w) > 2)
    self.word_features = [w for w, _ in all_words.most_common(2000)]

    featuresets = [(self.document_features(d), c) for (d, c) in documents]
    classifier = nltk.NaiveBayesClassifier.train(featuresets)
    return classifier
Example #18
def __init__(self, load=False, loadFile=""):
    if load:
        self.loadClassifier(loadFile)
    else:
        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')
        negfeats = [(self.word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
        posfeats = [(self.word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
        trainfeats = negfeats + posfeats
        self.classifier = NaiveBayesClassifier.train(trainfeats)
def train(feature):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeatures = [(feature(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeatures = [(feature(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    trainfeatures = negfeatures + posfeatures
    classifier = NaiveBayesClassifier.train(trainfeatures)

    return classifier
    def train_classifiers(self):
        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')
        negfeats = [(word_feats(
            movie_reviews.words(fileids=[f])), 'neg') for f in negids]
        posfeats = [(word_feats(
            movie_reviews.words(fileids=[f])), 'pos') for f in posids]
        trainfeats = negfeats + posfeats

        # train naive bayes
        self.classifier = NaiveBayesClassifier.train(trainfeats)
Example #21
def classify_document():
    from nltk.corpus import movie_reviews
    import random

    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    random.shuffle(documents)
    all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
    allwords = [w for w, _ in all_words.most_common(2000)]
    featuresets = [(document_features(d, allwords), c) for d, c in documents]
    return classify(nltk.NaiveBayesClassifier, featuresets, 0.1)
Example #22
  def train(self, feats):
    print("Starting to train the data")
    start = datetime.datetime.now()

    print("setting the ids", datetime.datetime.now())
    self.negids = movie_reviews.fileids('neg')
    self.posids = movie_reviews.fileids('pos')
    #random.shuffle(self.negids)
    #random.shuffle(self.posids)
    ##self.reviews = ([(movie_reviews.words(fileids=[f]), 'neg') for f in self.negids] +
        ##[(movie_reviews.words(fileids=[f]), 'pos') for f in self.posids])
    ##random.shuffle(self.reviews)

    ##self.train_set = apply_features(feats, self.reviews[len(self.reviews)*1/4:])
    ##self.test_set = apply_features(feats, self.reviews[:len(self.reviews)*1/4])

    print("setting the feats", datetime.datetime.now())
    self.negfeats = [(feats(movie_reviews.words(fileids=[f])), 'neg') for f in self.negids]
    self.posfeats = [(feats(movie_reviews.words(fileids=[f])), 'pos') for f in self.posids]

    self.negcutoff = len(self.negfeats) * 3 // 4
    self.poscutoff = len(self.posfeats) * 3 // 4

    print("setting the train/test", datetime.datetime.now())
    self.trainfeats = self.negfeats[:self.negcutoff] + self.posfeats[:self.poscutoff]
    self.testfeats = self.negfeats[self.negcutoff:] + self.posfeats[self.poscutoff:]

    print("training", datetime.datetime.now())
    self.classifier = NaiveBayesClassifier.train(self.trainfeats)
    ##self.classifier = NaiveBayesClassifier.train(self.train_set)
    self.refsets = defaultdict(set)
    self.testsets = defaultdict(set)

    print("accuracy stuff", datetime.datetime.now())
    for i, (feats, label) in enumerate(self.testfeats):
    ##for i, (feats, label) in enumerate(self.test_set):
      self.refsets[label].add(i)
      observed = self.classifier.classify(feats)
      self.testsets[observed].add(i)

    end = datetime.datetime.now()
    print("Training lasted for", end - start)

    print('accuracy:', nltk.classify.util.accuracy(self.classifier, self.testfeats))
    ##print('accuracy:', nltk.classify.util.accuracy(self.classifier, self.test_set))
    print('pos precision:', nltk.metrics.precision(self.refsets['pos'], self.testsets['pos']))
    print('pos recall:', nltk.metrics.recall(self.refsets['pos'], self.testsets['pos']))
    print('neg precision:', nltk.metrics.precision(self.refsets['neg'], self.testsets['neg']))
    print('neg recall:', nltk.metrics.recall(self.refsets['neg'], self.testsets['neg']))
    self.classifier.show_most_informative_features()
    self.trained = True
Example #23
def demo_movie_reviews(trainer, n_instances=None, output=None):
    """
    Train classifier on all instances of the Movie Reviews dataset.
    The corpus has been preprocessed using the default sentence tokenizer and
    WordPunctTokenizer.
    Features are composed of:
        - most frequent unigrams

    :param trainer: `train` method of a classifier.
    :param n_instances: the number of total reviews that have to be used for
        training and testing. Reviews will be equally split between positive and
        negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.corpus import movie_reviews
    from nltk.sentiment import SentimentAnalyzer

    if n_instances is not None:
        n_instances = int(n_instances/2)

    pos_docs = [(list(movie_reviews.words(pos_id)), 'pos') for pos_id in movie_reviews.fileids('pos')[:n_instances]]
    neg_docs = [(list(movie_reviews.words(neg_id)), 'neg') for neg_id in movie_reviews.fileids('neg')[:n_instances]]
    # We separately split positive and negative instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_docs = train_pos_docs+train_neg_docs
    testing_docs = test_pos_docs+test_neg_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words = sentim_analyzer.all_words(training_docs)

    # Add simple unigram word features
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
    # Apply features to obtain a feature-value representation of our datasets
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print('Your classifier does not provide a show_most_informative_features() method.')
    results = sentim_analyzer.evaluate(test_set)

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(output, Dataset='Movie_reviews', Classifier=type(classifier).__name__,
                        Tokenizer='WordPunctTokenizer', Feats=extr, Results=results,
                        Instances=n_instances)
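A hedged usage sketch for the demo above (assuming split_train_test, extract_unigram_feats, and output_markdown come from nltk.sentiment.util, as in NLTK's sentiment demos):

from nltk.classify import NaiveBayesClassifier

# Train and evaluate Naive Bayes on 200 balanced reviews.
demo_movie_reviews(NaiveBayesClassifier.train, n_instances=200)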
Example #24
def evaluate_classifier(featx):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
 
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
 
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    classifier = NaiveBayesClassifier.train(trainfeats)
    return classifier
Example #25
def train():
    global classifier

    # Train our classifier
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(feature_extractor(movie_reviews.words(fileids=[f])), 'neg')
                for f in negids]
    posfeats = [(feature_extractor(movie_reviews.words(fileids=[f])), 'pos')
                for f in posids]

    classifier = NaiveBayesClassifier.train(negfeats + posfeats)
    def train_classifier(self):
        """This code is heavily inspired by:
        http://streamhacker.com/2010/05/10/text-classification-sentiment-analysis-naive-bayes-classifier/
        """
        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')

        negfeats = [(self.word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
        posfeats = [(self.word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

        trainfeats = negfeats + posfeats

        self.classifier = NaiveBayesClassifier.train(trainfeats)
def main():
	negids = movie_reviews.fileids('neg')
	posids = movie_reviews.fileids('pos')

	to_review1 = "A man with a magnanimous spirit helps a mute girl from Pakistan return home."
	to_review2 = "Forced out of his own company by former Darren Cross, Dr. Hank Pym (Michael Douglas) recruits the talents of Scott Lang (Paul Rudd), a master thief just released from prison. Lang becomes Ant-Man, trained by Pym and armed with a suit that allows him to shrink in size, possess superhuman strength and control an army of ants. The miniature hero must use his new skills to prevent Cross, also known as Yellowjacket, from perfecting the same technology and using it as a weapon for evil."
	to_review3 = '''Parents need to know that kids may clamor to see this fast-paced, action-packed comic book-based adventure. But it's definitely more age-appropriate for teens than younger children. Although much of the violence is clearly meant to be based in the realm of sci-fi and fantasy -- and/or is shown at a distance -- there's plenty of it, from massive explosions to children held at gunpoint to super-powered fistfights. Some of the violence is war themed, and some characters get hurt and/or die. While much is made of lead character Tony Stark's devil-may-care lifestyle of fun and frolic, viewers also see him turn away from the more irresponsible aspects of playboyhood. Language is minimal, and sexual content is more suggested than shown overall -- though there are a few eyebrow-raising moments.'''
	reviews = []
	reviews.append(to_review1)
	reviews.append(to_review2)
	reviews.append(to_review3)

	for to_review in reviews:
		to_review_words = to_review.split(" ")
		print "Reviewing",to_review,"\n\n\n"


		print(''' Normal classification ''', "\n\n")
		negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
		posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
		calculateScore(classification(negfeats, posfeats, 1, 1), to_review_words)
		calculateScore(classification(negfeats, posfeats, 1, 0.95), to_review_words)
		calculateScore(classification(negfeats, posfeats, 0.95, 1), to_review_words)
		calculateScore(classification(negfeats, posfeats, 0.9, 1), to_review_words)
		calculateScore(classification(negfeats, posfeats, 1, 0.9), to_review_words)

		print(''' Without Punctuations ''', "\n\n")
		negfeats_punct = [(word_feats_punctuations(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
		posfeats_punct = [(word_feats_punctuations(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
		calculateScore_punctuations(classification(negfeats_punct, posfeats_punct, 1, 1), to_review_words)
		calculateScore_punctuations(classification(negfeats_punct, posfeats_punct, 1, 0.95), to_review_words)
		calculateScore_punctuations(classification(negfeats_punct, posfeats_punct, 0.95, 1), to_review_words)
		calculateScore_punctuations(classification(negfeats_punct, posfeats_punct, 0.9, 1), to_review_words)
		calculateScore_punctuations(classification(negfeats_punct, posfeats_punct, 1, 0.9), to_review_words)



		print(''' Without Stop Words ''', "\n\n")
		negfeats_stopwords = [(word_feats_stopwords(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
		posfeats_stopwords = [(word_feats_stopwords(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
		wordstoreview = []
		for each in to_review_words:
			if each not in stopwords.words('english'):
				wordstoreview.append(each)
		calculateScore_stopwords(classification(negfeats_stopwords, posfeats_stopwords, 1, 1), wordstoreview)
		calculateScore_stopwords(classification(negfeats_stopwords, posfeats_stopwords, 1, 0.95), wordstoreview)
		calculateScore_stopwords(classification(negfeats_stopwords, posfeats_stopwords, 0.95, 1), wordstoreview)
		calculateScore_stopwords(classification(negfeats_stopwords, posfeats_stopwords, 0.9, 1), wordstoreview)
		calculateScore_stopwords(classification(negfeats_stopwords, posfeats_stopwords, 1, 0.9), wordstoreview)


		print(''' With Lemmatizer ''', "\n\n")
		negfeats_lemma = [(word_feats_lemmatize(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
		posfeats_lemma = [(word_feats_lemmatize(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
		calculateScore_lemmatizer(classification(negfeats_lemma, posfeats_lemma, 1, 1), to_review_words)
		calculateScore_lemmatizer(classification(negfeats_lemma, posfeats_lemma, 1, 0.95), to_review_words)
		calculateScore_lemmatizer(classification(negfeats_lemma, posfeats_lemma, 0.95, 1), to_review_words)
		calculateScore_lemmatizer(classification(negfeats_lemma, posfeats_lemma, 0.9, 1), to_review_words)
		calculateScore_lemmatizer(classification(negfeats_lemma, posfeats_lemma, 1, 0.9), to_review_words)
Example #28
def evaluate_classifier(featx):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    count = 1500000
    lines = []

    english_stops = set(stopwords.words('english'))
    print ctime(), "Reading files..."

    f = open('Sentiment Analysis Dataset.csv', "r")

    line = f.readline()
    line = f.readline()

    negfeats = []
    posfeats = []

    for i in range(count):
        lines.append(line)
        line = f.readline()
    f.close()
    random.shuffle(lines)

    
    negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
 
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
 
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
 
    classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
 
    for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)
 
    print('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
    print('pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos']))
    print('pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos']))
    print('neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg']))
    print('neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg']))
    classifier.show_most_informative_features()
Example #29
  def train_classifier(self):
    # Training
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(self.best_word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(self.best_word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    self.sentiment_classifier = NaiveBayesClassifier.train(trainfeats)
def trainMovies():
    negids = movie_reviews.fileids('neg')
    print(type(negids), negids)
    posids = movie_reviews.fileids('pos')

    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    train = negfeats + posfeats

    classifier = NaiveBayesClassifier.train(train)

    f = open('movie_sentiment_classifier.pickle', 'wb')
    pickle.dump(classifier, f)
    f.close()
Example #31
def train():
    print('train')
    test_list = []
    logPrior = []
    likelihoods = {}
    big_doc = []
    N_docs = len(movie_reviews.fileids())
    #vocab = []
    #for fileid in movie_reviews.fileids():
    #    vocab.append(set(w.lower() for w in movie_reviews.words(fileids)))
    classes = movie_reviews.categories()
    vocab = []
    c_id = 0
    for c in classes:
        likelihoods[c_id] = {}
        text_c = ''  # reset per class so big_doc[c_id] holds only class-c text
        class_list = movie_reviews.fileids(c)
        N_c = len(class_list)
        logPrior.append(math.log(N_c / N_docs))

        # get test and train list
        tenPercent = int(len(class_list) * .1)
        lastPercent = int(len(class_list) * .9)
        testList = class_list[:tenPercent]
        class_list = class_list[-lastPercent:]  # keep only the 90% train split
        test_list.append(testList)

        # put all docs of class c into big_doc[c_id]
        for title in class_list:
            review = movie_reviews.words(title)  # list of words in a review
            review = ' '.join(review)  # make into string to tokenize
            tok = word_tokenize(review)  # tokenize returns a list

            remove = string.digits + string.punctuation
            table = str.maketrans('', '', remove)
            filtered = [w.translate(table) for w in tok]

            stop_words = set(stopwords.words('english'))  # common small words
            filtered = [w for w in filtered if w not in stop_words]  # stop words removed

            filtered = [w for w in filtered if len(w) > 0]

            text = ' '.join(filtered)
            # get vocab
            for w in filtered:
                vocab.append(w)

            #print("text " + text)
            text_c = text_c + text + " "
            #print(text_c)

        vocab = list(dict.fromkeys(vocab))
        big_doc.append(text_c)
        c_id = c_id + 1

    # restart the loop, because the vocab from both classes is needed
    c_id = 0
    for c in classes:
        print(c)
        count_w_v_c = 0
        for w in vocab:
            count_w_v_c = big_doc[c_id].count(w) + 1 + count_w_v_c
        for w in vocab:
            count_w_c = big_doc[c_id].count(w) + 1
            likelihoods[c_id][w] = math.log(count_w_c / count_w_v_c)
        c_id = c_id + 1

    return logPrior, likelihoods, vocab, test_list
Example #32
from nltk.corpus import movie_reviews
import nltk
import random
#prepare data set with labels
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)
all_words = nltk.FreqDist(
    w.lower() for w in movie_reviews.words())  # FreqDist() counts the occurrences of each word
word_features = list(all_words)[:2000]


#checks whether each of these words is present in a given document.
def document_features(document):
    # We compute the set of all words in a document, rather than just checking
    # "word in document", because checking membership in a set is much faster
    # than checking membership in a list (see NLTK book, ch. 4.7).
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features


#print(document_features(movie_reviews.words('pos/cv957_8737.txt')))
featuresets = [(document_features(d), c) for (d, c) in documents]
train_set = featuresets[100:]
test_set = featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
Example #33
        if token in positiveWords:
            countPos += 1
        if token in negativeWords:
            countNeg += 1

    if countPos >= countNeg:
        features['guess'] = "positive"
    elif countNeg > countPos:
        features['guess'] = "negative"
    return features


# prepare review data as a list of tuples:
# (list of tokens, category)
# category is positive / negative
review_data = [(movie_reviews.words(fileid), category)
               for category in movie_reviews.categories()
               for fileid in movie_reviews.fileids(category)]

threshold = 10000  # 10000 appears to be the best threshold

fd_all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
top_words = [word for (word, freq) in fd_all_words.most_common(threshold)]

review_data_fdist = [(nltk.FreqDist(token.lower() for token in words
                                    if token in top_words), category)
                     for words, category in review_data]
# Shuffle data randomly
random.seed(42)
random.shuffle(review_data_fdist)
# Split in training (80 percent) and test set (20 percent)
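The snippet ends at the comment above; a minimal sketch of the announced 80/20 split (training a Naive Bayes classifier afterwards is an assumption, consistent with the other examples here):

split = int(0.8 * len(review_data_fdist))
train_set, test_set = review_data_fdist[:split], review_data_fdist[split:]

classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))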
Example #34
         ('This is an amazing place!', 'pos'),
         ('I feel very good about these beers.', 'pos'),
         ('This is my best work.', 'pos'), ("What an awesome view", 'pos'),
         ('I do not like this restaurant', 'neg'),
         ('I am tired of this stuff.', 'neg'),
         ("I can't deal with this", 'neg'), ('He is my sworn enemy!', 'neg'),
         ('My boss is horrible.', 'neg')]
test = [('The beer was good.', 'pos'), ('I do not enjoy my job', 'neg'),
        ("I ain't feeling dandy today.", 'neg'), ("I feel amazing!", 'pos'),
        ('Gary is a friend of mine.', 'pos'),
        ("I can't believe I'm doing this.", 'neg')]

cl = DecisionTreeClassifier(train)

# Grab some movie review data
reviews = [(list(movie_reviews.words(fileid)), category)
           for category in movie_reviews.categories()
           for fileid in movie_reviews.fileids(category)]
random.shuffle(reviews)
new_train, new_test = reviews[0:100], reviews[100:150]

# Update the classifier with the new training data
cl.update(new_train)

# Compute accuracy
accuracy = cl.accuracy(test + new_test)
print("Accuracy: {0}".format(accuracy))

# Show 5 most informative features
#cl.show_informative_features(5)
import itertools
import pandas as pd
import numpy as np
import string

sw = set(stopwords.words('english'))
punctuation = set(string.punctuation)
all_names = set([name.lower() for name in names.words()])


def isStopWord(word):
    return (word in sw
            or word in punctuation) or not word.isalpha() or word in all_names


review_words = movie_reviews.words()
filtered = [w.lower() for w in review_words if not isStopWord(w.lower())]

words = FreqDist(filtered)

texts = []

for fid in movie_reviews.fileids():
    texts.append(" ".join([
        w.lower() for w in movie_reviews.words(fid)
        if not isStopWord(w.lower()) and words[w.lower()] > 1
    ]))

vectorizer = TfidfVectorizer(stop_words='english')
matrix = vectorizer.fit_transform(texts)
sums = np.array(matrix.sum(axis=0)).ravel()
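A hedged follow-up showing what the summed tf-idf column vector is typically used for, ranking the corpus's top terms (get_feature_names_out assumes a newer scikit-learn; pandas is already imported above):

ranks = pd.Series(sums, index=vectorizer.get_feature_names_out())
print(ranks.sort_values(ascending=False).head(10))  # ten highest-weighted terms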
Example #36
#Read https://stackoverflow.com/questions/10059594/a-simple-explanation-of-naive-bayes-classification for how NaiveBayesClassifier works


def naive_bayes_input(words):
    useful_words = [
        word for word in words if word not in stopwords.words('english')
    ]
    words_dict = dict([(word, True) for word in useful_words])
    return words_dict  #This is how NaiveBayesClassifier expects input


#The sentiment analysis code is just a machine learning algorithm that has been trained to identify positive/negative reviews.

negative_reviews = []
for file in movie_reviews.fileids('neg'):
    words = movie_reviews.words(file)
    negative_reviews.append((naive_bayes_input(words), 'negative'))

positive_reviews = []
for file in movie_reviews.fileids('pos'):
    words = movie_reviews.words(file)
    positive_reviews.append((naive_bayes_input(words), 'positive'))

# print(len(negative_reviews), len(positive_reviews))

train_set = negative_reviews[:800] + positive_reviews[:800]
test_set = negative_reviews[800:] + positive_reviews[800:]

classifier = NaiveBayesClassifier.train(train_set)

accuracy = nltk.classify.util.accuracy(classifier, test_set)
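A short hedged usage example for the trained classifier above (the sample sentence is made up):

print("Accuracy:", accuracy)
review = "the plot was gripping and the acting was superb"
print(classifier.classify(naive_bayes_input(review.split())))  # 'positive' or 'negative'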
Example #37
# define a 80/20 split for train/test
SPLIT = 0.8

# file IDs for the positive and negative reviews
posids = movie_reviews.fileids('pos')
negids = movie_reviews.fileids('neg')


def word_feats(words):
    feats = defaultdict(lambda: False)
    for word in words:
        feats[word] = True
    return feats


posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos')
            for f in posids]
negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg')
            for f in negids]

cutoff = int(len(posfeats) * SPLIT)

trainfeats = negfeats[:cutoff] + posfeats[:cutoff]
testfeats = negfeats[cutoff:] + posfeats[cutoff:]

print('Train on %d instances' % len(trainfeats))
print('Test on %d instances' % len(testfeats))

classifier = NaiveBayesClassifier.train(trainfeats)
print('Accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
from nltk.corpus import movie_reviews
import nltk
import random
import pickle

document = []
for category in movie_reviews.categories():
	for fileid in movie_reviews.fileids(category):
		document.append((movie_reviews.words(fileid),category))

random.shuffle(document)

all_words = []
for w in movie_reviews.words():
	all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())[:3000]
# print(word_features)

# document = ["words","category","words","category",..]
# word_features = [top 3000 words]

def find_features(document):
	words = set(document)
	features = {}
	for w in word_features:
		features[w] = (w in words)

	return features
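The snippet imports pickle but stops before using it; a hedged sketch of the usual continuation (feature sets, training, and saving the model; the pickle filename is an assumption):

featuresets = [(find_features(rev), category) for (rev, category) in document]
train_set, test_set = featuresets[:1900], featuresets[1900:]

classifier = nltk.NaiveBayesClassifier.train(train_set)
print("Accuracy:", nltk.classify.accuracy(classifier, test_set))

with open("naivebayes.pickle", "wb") as f:  # filename is an assumption
	pickle.dump(classifier, f)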
import nltk
import random
from nltk.corpus import movie_reviews
import pickle
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import SVC, NuSVC

documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))

random.shuffle(documents)

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:3000]


def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features
Example #40
import nltk
import random
from nltk.corpus import movie_reviews

corpus_list = []

corpus_list = [(list(movie_reviews.words(fileid)), category)
               for category in movie_reviews.categories()
               for fileid in movie_reviews.fileids(category)]

random.shuffle(corpus_list)

entire_words_list = []
for word in movie_reviews.words():
    entire_words_list.append(word.lower())

entire_words_list = nltk.FreqDist(entire_words_list)
features_wd = list(entire_words_list.keys())[:4000]


def locate_word_features(corpus):
    words_text = set(corpus)
    feature_set = {}
    for word in features_wd:
        feature_set[word] = (word in words_text)
    return feature_set


print((locate_word_features(movie_reviews.words('pos/cv000_29590.txt'))))

features = [(locate_word_features(review), category)
            for (review, category) in corpus_list]
def we_represent(tokens):
    vec = numpy.zeros(300)
    for tok in tokens:
        if tok.lower() in w2v:
            vec += w2v[tok.lower()]
    return vec

training_instances = []
training_labels = []
test_instances = []
test_labels = []

for label in movie_reviews.categories():
    for fileid in movie_reviews.fileids(label):
        doc = movie_reviews.words(fileid)
        instance = we_represent(doc)
        if label == 'pos':
            lbl = 1
        else:
            lbl = 0
        if random.randint(0, 9) == 0:
            test_instances.append(instance)
            test_labels.append(lbl)
        else:
            training_instances.append(instance)
            training_labels.append(lbl)

print(training_instances)
print(training_labels)
print(test_instances)
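A hedged sketch of how these averaged word-embedding instances are typically consumed, here with scikit-learn (the LogisticRegression choice is an assumption; any vector classifier would do):

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=1000)
clf.fit(training_instances, training_labels)
print("Accuracy:", clf.score(test_instances, test_labels))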
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)

        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:3000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features
# Text Classification

# Import libraries
import nltk
import random
from nltk.corpus import movie_reviews

# Create a list of tuples
documents = []

for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))

# Shuffle the documents
random.shuffle(documents)
print(documents[0])

# Normalize the dataset
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

# NLTK frequency distribution
all_words = nltk.FreqDist(all_words)
print(all_words.most_common(15))
print(all_words['love'])

# Limit the words
word_features = list(all_words.keys())[:3000]
Example #44
def demo_movie_reviews(trainer, n_instances=None, output=None):
    """
    Train classifier on all instances of the Movie Reviews dataset.
    The corpus has been preprocessed using the default sentence tokenizer and
    WordPunctTokenizer.
    Features are composed of:
        - most frequent unigrams

    :param trainer: `train` method of a classifier.
    :param n_instances: the number of total reviews that have to be used for
        training and testing. Reviews will be equally split between positive and
        negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.corpus import movie_reviews
    from nltk.sentiment import SentimentAnalyzer

    if n_instances is not None:
        n_instances = int(n_instances / 2)

    pos_docs = [(list(movie_reviews.words(pos_id)), "pos")
                for pos_id in movie_reviews.fileids("pos")[:n_instances]]
    neg_docs = [(list(movie_reviews.words(neg_id)), "neg")
                for neg_id in movie_reviews.fileids("neg")[:n_instances]]
    # We separately split positive and negative instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_docs = train_pos_docs + train_neg_docs
    testing_docs = test_pos_docs + test_neg_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words = sentim_analyzer.all_words(training_docs)

    # Add simple unigram word features
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats,
                                       unigrams=unigram_feats)
    # Apply features to obtain a feature-value representation of our datasets
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print(
            "Your classifier does not provide a show_most_informative_features() method."
        )
    results = sentim_analyzer.evaluate(test_set)

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(
            output,
            Dataset="Movie_reviews",
            Classifier=type(classifier).__name__,
            Tokenizer="WordPunctTokenizer",
            Feats=extr,
            Results=results,
            Instances=n_instances,
        )
Example #45
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
 
def extract_features(word_list):
    return dict([(word, True) for word in word_list])
 
if __name__=='__main__':
    # Load positive and negative reviews  
    positive_fileids = movie_reviews.fileids('pos')
    negative_fileids = movie_reviews.fileids('neg')
     
    features_positive = [(extract_features(movie_reviews.words(fileids=[f])), 
            'Positive') for f in positive_fileids]
    features_negative = [(extract_features(movie_reviews.words(fileids=[f])), 
            'Negative') for f in negative_fileids]
     
    # Split the data into train and test (80/20)
    threshold_factor = 0.8
    threshold_positive = int(threshold_factor * len(features_positive))
    threshold_negative = int(threshold_factor * len(features_negative))
     
    features_train = features_positive[:threshold_positive] + features_negative[:threshold_negative]
    features_test = features_positive[threshold_positive:] + features_negative[threshold_negative:]  
    print "\nNumber of training datapoints:", len(features_train)
    print "Number of test datapoints:", len(features_test)
     
    # Train a Naive Bayes classifier
    classifier = NaiveBayesClassifier.train(features_train)
    print "\nAccuracy of the classifier:", nltk.classify.util.accuracy(classifier, features_test)
Example #46
x_test = all_tagged_sents[train_size:]

tagger = nltk.UnigramTagger(train=x_train,backoff=nltk.DefaultTagger('n'))

tokens = nltk.word_tokenize(u'我 认为 不丹 的 被动 卷入 不 构成 此次 对峙 的 主要 因素。')
tagged = tagger.tag(tokens)
#["我", "R"], ["认为", "V"], ["不丹", "n"], ["的", "U"], ["被动", "A"], ["卷入", "V"], ["不", "D"], ["构成", "V"], ["此次", "R"], ["对峙", "V"], ["的", "U"], ["主要", "B"], ["因素。", "n"]
print (tagger.evaluate(x_test)) #0.871

#################################################### NLTK study notes, part 3: text classification and building a classification-based POS tagger
## 1. Text classification example
import random
import nltk
from nltk.corpus import movie_reviews

docs = [(list(movie_reviews.words(fileid)),category)
        for category in movie_reviews.categories()
        for fileid in movie_reviews.fileids(category)]

random.shuffle(docs)

all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
most_common_words = [word for (word, _) in all_words.most_common(2000)]

def doc_feature(doc):
    doc_words = set(doc)
    feature = {}
    for word in most_common_words:
        feature[word] = (word in doc_words)
    return feature
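The example stops at the feature function; a minimal hedged completion in the spirit of the surrounding snippets (the 100-document test split mirrors the other examples):

featuresets = [(doc_feature(d), c) for (d, c) in docs]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))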
negative_fileids = movie_reviews.fileids('neg')


def build_bag(words):
    word_dict = {}
    for word in words:
        if word not in exclude:
            word = unicodedata.normalize(
                'NFKD', word).encode('ascii', 'ignore')
            word_dict[word] = 1
    return word_dict


pos_bag = []
for f in positive_fileids:
    pos_bag.append((build_bag(movie_reviews.words(fileids=[f])), 'pos'))

neg_bag = []
for f in negative_fileids:
    neg_bag.append((build_bag(movie_reviews.words(fileids=[f])), 'neg'))

split = 800
sentiment_classifier = NaiveBayesClassifier.train(
    pos_bag[:split] + neg_bag[:split])

train_accuracy = nltk.classify.util.accuracy(
    sentiment_classifier, pos_bag[:split] + neg_bag[:split])*100

test_accuracy = nltk.classify.util.accuracy(
    sentiment_classifier, pos_bag[split:] + neg_bag[split:])*100
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))

random.shuffle(documents)

#print(documents[0])
all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())[:3000]


def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features
Example #49
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 28 07:44:58 2017

@author: Naruto_kathi
"""

import nltk
import random
from nltk.corpus import movie_reviews
import pickle

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

allwords = list(w.lower() for w in movie_reviews.words())

allwords = nltk.FreqDist(allwords)
#print(allwords.most_common(20))

word_features = list(allwords.keys())[:3000]
#print(word_features,"List of top 50 most common words")


def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features
def format_dataset(fileids, featureset):
    fileid_count = 0
    dataset = list()
    for fileid in fileids:
        fileid_count = fileid_count + 1
        review = set(movie_reviews.words(fileid))
        features = dict()
        for word in featureset:
            features[word] = word in review
        pos_or_neg = fileid[:3]
        dataset.append((features, pos_or_neg))
    return dataset


# Collect all the words in the training examples
vocabulary = set()
for fileid in train_fileids:
    for word in movie_reviews.words(fileid):
        vocabulary.add(word)

# Try a feature set of 500 random words
vocabulary = list(vocabulary)
random.shuffle(vocabulary)
random_featureset = vocabulary[:500]

train_set = format_dataset(train_fileids, random_featureset)
test_set = format_dataset(test_fileids, random_featureset)
bayes = NaiveBayesClassifier.train(train_set)

print("Random words: ", random_featureset)
print("Naive Bayes accuracy:", accuracy(bayes, test_set))

# Try a feature set of the 500 words that appear most often in the training examples
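The snippet ends at that comment; a hedged sketch of the frequency-based variant it announces (assuming FreqDist is imported from nltk, which the rest of the snippet does not show):

fdist = FreqDist(w for fileid in train_fileids for w in movie_reviews.words(fileid))
common_featureset = [w for w, _ in fdist.most_common(500)]

train_set = format_dataset(train_fileids, common_featureset)
test_set = format_dataset(test_fileids, common_featureset)
bayes = NaiveBayesClassifier.train(train_set)
print("Naive Bayes accuracy:", accuracy(bayes, test_set))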
Example #51
#pylint: disable=C0103
'''Classifying a movie review based'''
import string
from itertools import chain

from nltk.corpus import movie_reviews as mr
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
import nltk

stop = stopwords.words('english')
documents = [([w for w in mr.words(i) if w.lower() not in stop and w.lower() not in string.punctuation], i.split('/')[0]) for i in mr.fileids()]

word_features = FreqDist(chain(*[i for i,j in documents]))
word_features = list(word_features.keys())[:100]

numtrain = int(len(documents) * 90 / 100)
train_set = [({i:(i in tokens) for i in word_features}, tag) for tokens,tag in documents[:numtrain]]
test_set = [({i:(i in tokens) for i in word_features}, tag) for tokens,tag in documents[numtrain:]]

classifier = nbc.train(train_set)
print((nltk.classify.accuracy(classifier, test_set)))
classifier.show_most_informative_features(5)
# In[90]:

nltk.download("punkt")

# In[91]:

romeo_words = nltk.word_tokenize(romeo_text)

# In[92]:

romeo_words
#now the punctuation is separated nicely

# In[93]:

movie_reviews.words(fileids=positive_fileids[0])

# In[94]:

#the simplest way for analysing text is to think of words as an unordered collection of words
#dictionary
{word: True for word in romeo_words}

# In[95]:

type(_)

#    '_'    is the output from last code i.e. the line above

# In[96]:
Example #53
    def train(self, feats):
        print "Starting to train the data"
        start = datetime.datetime.now()

        print "setting the ids", datetime.datetime.now()
        self.negids = movie_reviews.fileids('neg')
        self.posids = movie_reviews.fileids('pos')
        #random.shuffle(self.negids)
        #random.shuffle(self.posids)
        ##self.reviews = ([(movie_reviews.words(fileids=[f]), 'neg') for f in self.negids] +
        ##[(movie_reviews.words(fileids=[f]), 'pos') for f in self.posids])
        ##random.shuffle(self.reviews)

        ##self.train_set = apply_features(feats, self.reviews[len(self.reviews)*1/4:])
        ##self.test_set = apply_features(feats, self.reviews[:len(self.reviews)*1/4])

        print "setting the feats", datetime.datetime.now()
        self.negfeats = [(feats(movie_reviews.words(fileids=[f])), 'neg')
                         for f in self.negids]
        self.posfeats = [(feats(movie_reviews.words(fileids=[f])), 'pos')
                         for f in self.posids]

        self.negcutoff = len(self.negfeats) * 3 / 4
        self.poscutoff = len(self.posfeats) * 3 / 4

        print "setting the train/test", datetime.datetime.now()
        self.trainfeats = self.negfeats[:self.negcutoff] + self.posfeats[:self.poscutoff]
        self.testfeats = self.negfeats[self.negcutoff:] + self.posfeats[self.poscutoff:]

        print "training", datetime.datetime.now()
        self.classifier = NaiveBayesClassifier.train(self.trainfeats)
        ##self.classifier = NaiveBayesClassifier.train(self.train_set)
        self.refsets = defaultdict(set)
        self.testsets = defaultdict(set)

        print "accuracy stuff", datetime.datetime.now()
        # 'featdict' avoids shadowing the feats() extractor passed as argument
        for i, (featdict, label) in enumerate(self.testfeats):
            ##for i, (featdict, label) in enumerate(self.test_set):
            self.refsets[label].add(i)
            observed = self.classifier.classify(featdict)
            self.testsets[observed].add(i)

        end = datetime.datetime.now()
        print "Training lasted for ", end - start

        print 'accuracy:', nltk.classify.util.accuracy(self.classifier,
                                                       self.testfeats)
        ##print 'accuracy:', nltk.classify.util.accuracy(self.classifier, self.test_set)
        print 'pos precision:', nltk.metrics.precision(self.refsets['pos'],
                                                       self.testsets['pos'])
        print 'pos recall:', nltk.metrics.recall(self.refsets['pos'],
                                                 self.testsets['pos'])
        print 'neg precision:', nltk.metrics.precision(self.refsets['neg'],
                                                       self.testsets['neg'])
        print 'neg recall:', nltk.metrics.recall(self.refsets['neg'],
                                                 self.testsets['neg'])
        self.classifier.show_most_informative_features()
        self.trained = True
# random.shuffle(documents)

# print(documents[1])

from collections import defaultdict

documents = defaultdict(list)  # map category ('pos'/'neg') -> list of its fileids
for i in mr.fileids():
    documents[i.split('/')[0]].append(i)

random.shuffle(documents['pos'])
random.shuffle(documents['neg'])

#print(documents['pos'][:10]) # first ten pos reviews.
#print
#print(documents['neg'][:10]) # first ten neg reviews.

documents = [([
    w for w in mr.words(i)
    if w.lower() not in stop and w.lower() not in string.punctuation
], i.split('/')[0]) for i in mr.fileids()]

random.shuffle(documents)

allWords = []
for w in mr.words():
    allWords.append(w.lower())

allWords = nltk.FreqDist(allWords)

#print(allWords.most_common(15))
#print(allWords["stupid"])

wordFeatures = [w for (w, _) in allWords.most_common(3000)]  # the 3000 most frequent words
Example #55
# AUTHOR: GIRISH SRINIVAS

import nltk, random
from nltk.corpus import movie_reviews
from nltk.corpus import wordnet as wn

all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = [w for (w, _) in all_words.most_common(2000)]  # the 2000 most frequent words


def document_features4a(document):
    # NOTE: document_words is never consulted below, so every document yields
    # the same features: each vocabulary word is marked 'KNOWN' if it has any
    # WordNet synsets and 'UNK' otherwise.
    document_words = set(document)
    features = {}
    for word in word_features:
        if len(wn.synsets(word)) > 0:
            features['contains(%s)' % word] = 'KNOWN'
        else:
            features['contains(%s)' % word] = 'UNK'
    return features


def main():
    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    random.shuffle(documents)
    featuresets = [(document_features4a(d), c) for (d, c) in documents]
    train_set, test_set = featuresets[100:], featuresets[:100]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print(nltk.classify.accuracy(classifier, test_set))
    classifier.show_most_informative_features(5)
Example #56
import nltk
import random
from nltk.corpus import movie_reviews
import pickle
import os.path
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)
word_features = [w for (w, _) in all_words.most_common(3000)]  # the 3000 most frequent words


def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features
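
# The excerpt stops before the scikit-learn imports are used; the usual
# continuation looks like this (a sketch/assumption, not in the original):
featuresets = [(find_features(rev), category) for (rev, category) in documents]
train_set, test_set = featuresets[:1900], featuresets[1900:]

# NLTK's SklearnClassifier wraps a scikit-learn estimator behind the NLTK
# classifier interface, so it trains on the same (feature dict, label) pairs:
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(train_set)
print("MultinomialNB accuracy:", nltk.classify.accuracy(MNB_classifier, test_set))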

Example #57
#TEXT CLASSIFICATION

import nltk
import random
from nltk.corpus import movie_reviews

documents = [
    (list(movie_reviews.words(fileid)), category)  # a review's words paired with its 'pos'/'neg' label
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]

random.shuffle(documents)  # the corpus is ordered by class: 1000 positive, then 1000 negative reviews

all_words = []

for w in movie_reviews.words():  # collect every word in the corpus so we can rank it by frequency
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = [w for (w, _) in all_words.most_common(3000)]  # the 3000 most frequent words


def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

from nltk.corpus import movie_reviews 
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy as nltk_accuracy
 
# Extract features from the input list of words
def extract_features(words):
    return dict([(word, True) for word in words])
 
if __name__=='__main__':
    # Load the reviews from the corpus 
    fileids_pos = movie_reviews.fileids('pos')
    fileids_neg = movie_reviews.fileids('neg')
     
    # Extract the features from the reviews
    features_pos = [(extract_features(movie_reviews.words(
            fileids=[f])), 'Positive') for f in fileids_pos]
    features_neg = [(extract_features(movie_reviews.words(
            fileids=[f])), 'Negative') for f in fileids_neg]
     
    # Define the train and test split (80% and 20%)
    threshold = 0.8
    num_pos = int(threshold * len(features_pos))
    num_neg = int(threshold * len(features_neg))
     
    # Create training and test datasets
    features_train = features_pos[:num_pos] + features_neg[:num_neg]
    features_test = features_pos[num_pos:] + features_neg[num_neg:]  

    # Print the number of datapoints used
    print('\nNumber of training datapoints:', len(features_train))
    print('Number of test datapoints:', len(features_test))
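
    # The excerpt ends here; the imported classifier and accuracy helper are
    # presumably used like this (a sketch, not from the original):
    classifier = NaiveBayesClassifier.train(features_train)
    print('\nAccuracy of the classifier:', nltk_accuracy(classifier, features_test))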
Example #59
import collections
import nltk.metrics
from nltk.metrics import precision, recall, f_measure
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords

import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

for word in movie_reviews.words(categories=['pos']):
    print(word)
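
# Only this word loop survives in the excerpt; the imports above point at the
# classic bigram-features setup, sketched here as an assumed continuation:
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    # score all bigrams in the review and keep the n highest-scoring ones
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    # feature dict over both the unigrams and the selected bigrams
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])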
Example #60
import nltk
from nltk.corpus import movie_reviews
import random
# step 1: build the labelled data set (review words, class)
reviews = [(movie_reviews.words(reviewid), clas) for clas in ['pos', 'neg']
           for reviewid in movie_reviews.fileids(clas)]

# step 1.5: find the 100 most common adjectives (JJ tags) in a sample
sample = random.sample(reviews, 200)
pos = [nltk.pos_tag(x) for x, y in sample]

jjlist = []
for i in pos:
    jj = [
        x[0] for x in i if "JJ" in x[1] and len(x[0]) >= 3 and x[0].isalpha()
    ]
    jjlist += jj

frequency = nltk.FreqDist(jjlist)
common_jj = [x for x, y in frequency.most_common(100)]


# step 2: feature function
def feature(text):
    words = set(x.lower() for x in text)  # lowercase once instead of per feature word
    res = {}
    for i in common_jj:
        res[i] = i in words
    return res


# step 3: generate the feature set
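
# A plausible continuation (a sketch; the hold-out size is an assumption):
featuresets = [(feature(words), label) for (words, label) in reviews]
random.shuffle(featuresets)  # the reviews come ordered pos-then-neg
train_set, test_set = featuresets[200:], featuresets[:200]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))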