Example #1
0
 def test_init_no_training(self):
     """An untrained classifier starts empty and learns from ``train``.

     Builds a ``MultinomialNB`` with no documents, checks that its
     vocabulary and label set are empty, then trains it on
     ``self.training_docs`` and checks what that same instance learned.
     """
     classifier = MultinomialNB()
     assert classifier.vocabulary == set()
     assert classifier.labels == set()

     classifier.train(*self.training_docs)
     # Check the freshly trained instance itself.  Delegating to
     # self.test_labels() / self.test_vocabulary() only inspects
     # self.classifier, so it would pass even if train() did nothing here.
     # Assumes each training doc is a (tokens, label) pair -- TODO confirm.
     expected_labels = set(label for _, label in self.training_docs)
     expected_vocabulary = set(
         token for tokens, _ in self.training_docs for token in tokens)
     assert classifier.labels == expected_labels
     assert classifier.vocabulary == expected_vocabulary
Example #2
0
class HierarchicalClassifier(Experiment):
    """Two-stage experiment: subjectivity first, then polarity.

    One ``MultinomialNB`` separates 'neutral' from 'subjective' documents;
    a second one is trained only on the non-neutral documents and labels
    them 'positive' or 'negative'.
    """

    def _setup(self):
        """Create the two untrained classifiers."""
        self.subjective = MultinomialNB()
        self.polarity = MultinomialNB()
        # Disabled experiments kept for reference:
        #   self.subjective.top, self.polarity.top = 100000, 100000
        #   subjective.prior = lambda x: 0.5
        #   polarity.prior = lambda x: 0.5

    def _predict(self, features):
        """Run the subjective stage, then the polarity stage if needed.

        Returns ``(label, probability)`` from the subjective classifier
        when it answers 'neutral'; otherwise returns whatever the polarity
        classifier's ``classify`` returns for the same features.
        """
        label, probability = self.subjective.classify(features)
        if label != 'neutral':
            return self.polarity.classify(features)
        return label, probability

    def _train(self, features, label):
        """Feed one labelled example to the appropriate stage(s).

        Non-neutral examples train the polarity classifier under their own
        label and the subjective classifier under 'subjective'; neutral
        examples train only the subjective classifier.
        """
        is_polar = label != 'neutral'
        if is_polar:
            assert label in {'positive', 'negative'}
            self.polarity.train((features, label))
        subjectivity = 'subjective' if is_polar else label
        assert subjectivity in {'neutral', 'subjective'}

        # NOTE(review): 8751 is an unexplained cap; the subjective stage
        # stops learning once its summed label counts reach it.
        counts = self.subjective._label_count
        if sum(counts[name] for name in counts) < 8751:
            self.subjective.train((features, subjectivity))

    def pickle_dumps(self):
        """Return the extractor and both classifiers as a pickle string."""
        classifiers = (self.subjective, self.polarity)
        payload = Pickled(self.extractor, classifiers)
        return pickle.dumps(payload, pickle.HIGHEST_PROTOCOL)
Example #3
0
 def test_init_no_training(self):
     """An untrained classifier starts empty and learns from ``train``.

     Builds a ``MultinomialNB`` with no documents, checks that its
     vocabulary and label set are empty, then trains it on
     ``self.training_docs`` and checks what that same instance learned.
     """
     classifier = MultinomialNB()
     assert classifier.vocabulary == set()
     assert classifier.labels == set()

     classifier.train(*self.training_docs)
     # Check the freshly trained instance itself.  Delegating to
     # self.test_labels() / self.test_vocabulary() only inspects
     # self.classifier, so it would pass even if train() did nothing here.
     # Assumes each training doc is a (tokens, label) pair -- TODO confirm.
     expected_labels = set(label for _, label in self.training_docs)
     expected_vocabulary = set(
         token for tokens, _ in self.training_docs for token in tokens)
     assert classifier.labels == expected_labels
     assert classifier.vocabulary == expected_vocabulary
Example #4
0
 def setup(self):
     """Train a classifier on four tokenized documents and snapshot it."""
     raw_docs = (('Chinese Bejing Chinese', 'yes'),
                 ('Chinese Chinese Shanghai', 'yes'),
                 ('Chinese Macao', 'yes'),
                 ('Tokyo Japan Chinese', 'no'))
     # Split each sentence on whitespace so every doc is (tokens, label).
     tokenized = []
     for text, label in raw_docs:
         tokenized.append((text.split(), label))
     self.training_docs = tokenized
     self.classifier = MultinomialNB(*tokenized)
     self.make_snapshot()
Example #5
0
class SingleClassifier(Experiment):
    """Experiment backed by one ``MultinomialNB`` over all labels."""

    def _setup(self):
        """Create the untrained classifier."""
        classifier = MultinomialNB()
        # Disabled experiment kept for reference: classifier.top = 100000
        self.nb = classifier

    def _predict(self, features):
        """Return the classifier's ``classify`` result for ``features``."""
        prediction = self.nb.classify(features)
        return prediction

    def _train(self, features, label):
        """Train the classifier on one ``(features, label)`` example."""
        example = (features, label)
        self.nb.train(example)

    def pickle_dumps(self):
        """Return the extractor and classifier as a pickle string."""
        payload = Pickled(self.extractor, self.nb)
        return pickle.dumps(payload, pickle.HIGHEST_PROTOCOL)
Example #6
0
 def setup(self):
     """Train a classifier on four tokenized documents and snapshot it."""
     raw_docs = (('Chinese Bejing Chinese', 'yes'),
                 ('Chinese Chinese Shanghai', 'yes'),
                 ('Chinese Macao', 'yes'),
                 ('Tokyo Japan Chinese', 'no'))
     # Split each sentence on whitespace so every doc is (tokens, label).
     tokenized = []
     for text, label in raw_docs:
         tokenized.append((text.split(), label))
     self.training_docs = tokenized
     self.classifier = MultinomialNB(*tokenized)
     self.make_snapshot()
Example #7
0
class OldClassifier(Experiment):
    """Experiment that reuses a previously pickled sentiment classifier.

    A freshly created ``MultinomialNB`` decides neutral vs. subjective;
    the unpickled legacy classifier then picks 'positive' or 'negative'
    for documents that were not judged neutral.
    """

    # Location of the pickled legacy classifier.  Override on a subclass or
    # on the instance (before ``_setup`` runs) to load another snapshot,
    # e.g. the smaller twitter-sentiment_classifier.5000.pickle that sits in
    # the same directory.
    classifier_path = r"R:\_Other\Twitter\TwitterCorpus\results_sentiment\unbalanced_1-gram_stopword\twitter-sentiment_classifier.1650000.pickle"

    def _setup(self):
        """Install the legacy feature extractor and load both classifiers."""
        import old_classify

        def extract(x):
            # NOTE(review): the return values of filter_text and
            # regularlize_text are discarded.  Unless they mutate ``x`` in
            # place, these calls do not influence the extracted features --
            # confirm against old_classify.
            old_classify.filter_text(x)
            old_classify.regularlize_text(x)
            features = old_classify.extract_features(x)
            return features
        self.extractor.extract = extract
        # Disabled alternative kept for reference:
        #   tokens = old_classify.tokenizer.tokenize(x)
        #   tokens = old_classify.regularlize_tokens(tokens)

        try:
            import cPickle as pickle
        except ImportError:
            import pickle
        # SECURITY: unpickling can execute arbitrary code, so only point
        # classifier_path at snapshots from a trusted source.
        with open(self.classifier_path, mode='rb') as f:
            self.classifier = pickle.load(f)
        self.subjective = MultinomialNB()

    def _predict(self, features):
        """Return ``(label, probability)`` for ``features``.

        The subjective classifier answers first; its result is returned
        as-is for 'neutral'.  Otherwise the legacy classifier's 'pos' and
        'neg' probabilities are compared and the larger one wins (ties go
        to 'negative').
        """
        label, probability = self.subjective.classify(features)
        if label == 'neutral':
            return label, probability
        sentiment = self.classifier.prob_classify(features)
        pos, neg = sentiment.prob('pos'), sentiment.prob('neg')
        if pos > neg:
            return 'positive', pos
        return 'negative', neg

    # NOTE(review): the other Experiment subclasses in this file name their
    # training hook ``_train``; confirm that the base class really calls
    # ``_train_func``, otherwise the subjective classifier is never trained.
    def _train_func(self, features, label):
        """Train the subjective classifier on one example.

        Every label other than 'neutral' is collapsed to 'subjective'; the
        legacy polarity classifier is not updated.
        """
        if label != 'neutral':
            label = 'subjective'
        assert label in set(['neutral', 'subjective'])
        self.subjective.train((features, label))
Example #8
0
    def test_top_features(self):
        """With ``top_features = 2`` only two features per label are kept.

        Checks the per-label ``_most_common`` stores, then checks that
        adding 'smile' or 'frown' tokens to a document leaves the compared
        ``prob_all`` results unchanged.
        """
        training = [(['happy', 'joy', 'smile'], 'positive'),
                    (['happy', 'joy', 'frown'], 'positive'),
                    (['sad', 'frown', 'tired'], 'negative'),
                    (['sad', 'tired', 'bored'], 'negative')]
        nb = MultinomialNB()
        nb.top_features = 2
        nb.train(*training)

        kept = (('positive', {'happy': 2, 'joy': 2}),
                ('negative', {'sad': 2, 'tired': 2}))
        for label, expected in kept:
            assert nb._most_common[label].store == expected

        # Repeating 'smile' must not move any label's probability.
        baseline = nb.prob_all(['happy', 'smile'])
        with_repeat = nb.prob_all(['happy', 'smile', 'smile'])
        assert baseline == with_repeat, nb._most_common

        # Appending 'frown' must not move the 'negative' probability.
        baseline = nb.prob_all(['sad', 'tired'])['negative']
        with_extra = nb.prob_all(['sad', 'tired', 'frown'])['negative']
        assert baseline == with_extra, nb._most_common
Example #9
0
    def test_top_features(self):
        """With ``top_features = 2`` only two features per label are kept.

        Checks the per-label ``_most_common`` stores, then checks that
        adding 'smile' or 'frown' tokens to a document leaves the compared
        ``prob_all`` results unchanged.
        """
        training = [(['happy', 'joy', 'smile'], 'positive'),
                    (['happy', 'joy', 'frown'], 'positive'),
                    (['sad', 'frown', 'tired'], 'negative'),
                    (['sad', 'tired', 'bored'], 'negative')]
        nb = MultinomialNB()
        nb.top_features = 2
        nb.train(*training)

        kept = (('positive', {'happy': 2, 'joy': 2}),
                ('negative', {'sad': 2, 'tired': 2}))
        for label, expected in kept:
            assert nb._most_common[label].store == expected

        # Repeating 'smile' must not move any label's probability.
        baseline = nb.prob_all(['happy', 'smile'])
        with_repeat = nb.prob_all(['happy', 'smile', 'smile'])
        assert baseline == with_repeat, nb._most_common

        # Appending 'frown' must not move the 'negative' probability.
        baseline = nb.prob_all(['sad', 'tired'])['negative']
        with_extra = nb.prob_all(['sad', 'tired', 'frown'])['negative']
        assert baseline == with_extra, nb._most_common
Example #10
0
    def _setup(self):
        """Swap in the legacy feature extractor and load both classifiers.

        ``self.extractor.extract`` is replaced by a closure over the
        ``old_classify`` module, ``self.classifier`` is unpickled from a
        fixed path on disk, and ``self.subjective`` starts out as an
        untrained ``MultinomialNB``.
        """
        import old_classify as legacy

        def extract(x):
            # The return values of the first two calls are not used.
            legacy.filter_text(x)
            legacy.regularlize_text(x)
            return legacy.extract_features(x)

        self.extractor.extract = extract
        # Disabled alternative kept for reference:
        #   tokens = old_classify.tokenizer.tokenize(x)
        #   tokens = old_classify.regularlize_tokens(tokens)

        # Prefer the C implementation where it exists (Python 2).
        try:
            import cPickle as pickle
        except ImportError:
            import pickle

        # A commented-out alternative loaded the smaller
        # twitter-sentiment_classifier.5000.pickle from the same directory.
        snapshot_path = r"R:\_Other\Twitter\TwitterCorpus\results_sentiment\unbalanced_1-gram_stopword\twitter-sentiment_classifier.1650000.pickle"
        with open(snapshot_path, mode='rb') as snapshot:
            self.classifier = pickle.load(snapshot)
        self.subjective = MultinomialNB()
Example #11
0
 def test_train_many_document(self):
     """Five copies of one tokenized document yield that document's vocabulary."""
     tokens = ['one', 'document', 'already', 'tokenized']
     expected = set(tokens)
     documents = [(tokens, 'label')] * 5
     classifier = MultinomialNB(*documents)
     assert classifier.vocabulary == expected
Example #12
0
class TestMultinomialNB(object):
    """Tests for ``MultinomialNB`` built around a four-document corpus.

    ``setup`` trains ``self.classifier`` before every test and snapshots its
    internal counters; ``teardown`` asserts that the test did not change
    them.
    """

    # This test uses the examples provided by:
    # http://nlp.stanford.edu/IR-book/pdf/13bayes.pdf
    def setup(self):
        """Train a classifier on the example corpus and snapshot it."""
        self.training_docs = [('Chinese Bejing Chinese', 'yes'),
                              ('Chinese Chinese Shanghai', 'yes'),
                              ('Chinese Macao', 'yes'),
                              ('Tokyo Japan Chinese', 'no')]
        # Tokenize on whitespace so each doc becomes (tokens, label).
        self.training_docs = [(x.split(), y) for x, y in self.training_docs]
        self.classifier = MultinomialNB(*self.training_docs)
        self.make_snapshot()

    def teardown(self):
        """Check that the test left the shared classifier's counters alone."""
        self.assert_snapshot_identical()

    def make_snapshot(self):
        """Deep-copy the classifier's internal counters for later comparison."""
        self.orig_label_count = deepcopy(self.classifier._label_count)
        self.orig_label_vocab = deepcopy(self.classifier._label_vocab)
        self.orig_label_feature_count = deepcopy(
            self.classifier._label_feature_count)
        self.orig_label_length = deepcopy(self.classifier._label_length)

    def assert_snapshot_identical(self):
        """Call if classifier's internals shouldn't have changed."""
        assert self.orig_label_count == self.classifier._label_count
        assert self.orig_label_vocab == self.classifier._label_vocab
        assert (self.orig_label_feature_count ==
                self.classifier._label_feature_count)
        assert self.orig_label_length == self.classifier._label_length

    def test_init_no_training(self):
        """An untrained classifier starts empty and learns from ``train``."""
        classifier = MultinomialNB()
        assert classifier.vocabulary == set()
        assert classifier.labels == set()

        classifier.train(*self.training_docs)
        # Check the freshly trained instance itself.  Delegating to
        # self.test_labels() / self.test_vocabulary() only inspects
        # self.classifier, so it would pass even if train() did nothing here.
        expected_labels = set(label for _, label in self.training_docs)
        expected_vocabulary = set(
            token for tokens, _ in self.training_docs for token in tokens)
        assert classifier.labels == expected_labels
        assert classifier.vocabulary == expected_vocabulary

    def test_train_one_document(self):
        """One (tokens, label) pair given to the constructor fills the vocabulary."""
        documents = (['one', 'document', 'already', 'tokenized'], 'label')
        classifier = MultinomialNB(documents)
        expected = set(['one', 'document', 'already', 'tokenized'])
        assert classifier.vocabulary == expected

    def test_train_many_document(self):
        """Five copies of one document yield that document's vocabulary."""
        documents = [(['one', 'document', 'already', 'tokenized'], 'label')
                     ] * 5
        classifier = MultinomialNB(*documents)
        expected = set(['one', 'document', 'already', 'tokenized'])
        assert classifier.vocabulary == expected

    def test_train_not_tokenized(self):
        """Training on a plain string instead of a token list raises TypeError."""
        document = ('one document not tokenized', 'label')
        assert_raises(TypeError, self.classifier.train, document)

    def test_labels(self):
        """The shared classifier knows exactly the labels 'yes' and 'no'."""
        expected = set(['yes', 'no'])
        assert self.classifier.labels == expected

    def test_vocabulary(self):
        """The shared classifier's vocabulary is the six distinct corpus tokens."""
        expected = set(
            ['Chinese', 'Bejing', 'Shanghai', 'Macao', 'Tokyo', 'Japan'])
        assert self.classifier.vocabulary == expected

    def test_vocab_size(self):
        """``_vocab_size`` equals the number of vocabulary entries."""
        actual = len(self.classifier.vocabulary)
        result = self.classifier._vocab_size
        assert actual == result

    def test_label_feature_count(self):
        """Per-label feature counts match the corpus; unseen pairs are absent."""
        tests = [('yes', 'Chinese', 5), ('no', 'Chinese', 1),
                 ('no', 'Japan', 1)]
        for label, feature, count in tests:
            assert self.classifier._label_feature_count[label][
                feature] == count
        assert 'Japan' not in self.classifier._label_feature_count['yes']

    def test_prior(self):
        """``prior`` is checked with ``exact`` on (Fraction) and off (float)."""
        tests = [('yes', Fraction(3, 4)), ('no', Fraction(1, 4))]
        for label, prob in tests:
            self.classifier.exact = True
            result = self.classifier.prior(label)
            assert result == prob

            self.classifier.exact = False
            result = self.classifier.prior(label)
            prob = float(prob)
            assert_almost_equal(result, prob)

    def test_prior_unseen_label(self):
        """Asking for the prior of an unknown label raises KeyError."""
        assert_raises(KeyError, self.classifier.prior, '__unseen__')

    def test_conditional(self):
        """``conditional`` matches the expected fractions, unseen features included."""
        tests = [('Chinese', 'yes', Fraction(6, 14)),
                 ('Japan', 'yes', Fraction(1, 14)),
                 ('Chinese', 'no', Fraction(2, 9)),
                 ('Tokyo', 'no', Fraction(2, 9)),
                 ('Japan', 'no', Fraction(2, 9)),
                 ('__invalid__', 'yes', Fraction(1, 14)),
                 ('__invalid__', 'no', Fraction(1, 9))]
        for feature, label, prob in tests:
            self.classifier.exact = True
            result = self.classifier.conditional(feature, label)
            assert result == prob

            self.classifier.exact = False
            result = self.classifier.conditional(feature, label)
            prob = float(prob)
            assert_almost_equal(result, prob)

    def test_conditional_laplace(self):
        """``conditional`` matches the expected fractions when ``laplace`` is 2."""
        self.classifier.laplace = 2
        tests = [('Chinese', 'yes', Fraction(7, 20)),
                 ('Japan', 'yes', Fraction(1, 10)),
                 ('Chinese', 'no', Fraction(1, 5)),
                 ('Tokyo', 'no', Fraction(1, 5)),
                 ('Japan', 'no', Fraction(1, 5)),
                 ('__invalid__', 'yes', Fraction(1, 10)),
                 ('__invalid__', 'no', Fraction(2, 15))]
        for feature, label, prob in tests:
            self.classifier.exact = True
            result = self.classifier.conditional(feature, label)
            assert result == prob

            self.classifier.exact = False
            result = self.classifier.conditional(feature, label)
            prob = float(prob)
            assert_almost_equal(result, prob)

    def test_conditional_unseen_feature(self):
        """Querying an unseen feature must not insert it into the counts."""
        self.classifier.conditional('__unseen__', 'yes')
        assert '__unseen__' not in self.classifier._label_feature_count['yes']

    def test_conditional_unseen_label(self):
        """An unseen label raises KeyError and is not inserted into the counts."""
        assert_raises(KeyError, self.classifier.conditional, '__unseen__',
                      '__unseen__')
        assert '__unseen__' not in self.classifier._label_feature_count

    def test_score(self):
        """``_score`` equals prior times conditionals for the test document."""
        tests = [('Chinese Chinese Chinese Tokyo Japan', 'yes',
                  Fraction(3, 4) * Fraction(3, 7) * Fraction(3, 7) *
                  Fraction(3, 7) * Fraction(1, 14) * Fraction(1, 14)),
                 ('Chinese Chinese Chinese Tokyo Japan', 'no',
                  Fraction(1, 4) * Fraction(2, 9) * Fraction(2, 9) *
                  Fraction(2, 9) * Fraction(2, 9) * Fraction(2, 9))]
        for document, label, score in tests:
            self.classifier.exact = True
            result = self.classifier._score(document.split(), label)
            assert result == score

            self.classifier.exact = False
            result = self.classifier._score(document.split(), label)
            # The non-exact score is exponentiated before being compared
            # with the product of probabilities.
            result = math.exp(result)
            score = float(score)
            assert_almost_equal(result, score)

    def test_score_not_tokenized(self):
        """``_score`` raises TypeError for an untokenized string document."""
        document, label = 'Chinese Chinese Chinese Tokyo Japan', 'yes'
        assert_raises(TypeError, self.classifier._score, document, label)

    def test_prob(self):
        """``prob`` returns the expected per-label value in both modes."""
        tests = [('Chinese Chinese Chinese Tokyo Japan', 'yes',
                  Fraction(4782969, 6934265)),
                 ('Chinese Chinese Chinese Tokyo Japan', 'no',
                  Fraction(2151296, 6934265))]
        for document, label, prob in tests:
            self.classifier.exact = True
            result = self.classifier.prob(document.split(), label)
            assert result == prob

            self.classifier.exact = False
            result = self.classifier.prob(document.split(), label)
            prob = float(prob)
            assert_almost_equal(result, prob)

    def test_prob_not_tokenized(self):
        """``prob`` raises TypeError for an untokenized string document."""
        document, label = 'Chinese Chinese Chinese Tokyo Japan', 'yes'
        assert_raises(TypeError, self.classifier.prob, document, label)

    def test_prob_all(self):
        """``prob_all`` maps each label to the same values ``test_prob`` expects."""
        document = 'Chinese Chinese Chinese Tokyo Japan'
        tests = [('yes', Fraction(4782969, 6934265)),
                 ('no', Fraction(2151296, 6934265))]
        for label, prob in tests:
            self.classifier.exact = True
            prob_all = self.classifier.prob_all(document.split())
            assert prob_all[label] == prob

            self.classifier.exact = False
            prob_all = self.classifier.prob_all(document.split())
            prob = float(prob)
            assert_almost_equal(prob_all[label], prob)

    def test_prob_all_not_tokenized(self):
        """``prob_all`` raises TypeError for an untokenized string document."""
        document = 'Chinese Chinese Chinese Tokyo Japan'
        assert_raises(TypeError, self.classifier.prob_all, document)

    def test_prob_all_near_zero(self):
        """``prob_all`` must not raise on a 5000-token document in float mode."""
        # Issue gh-14.
        document = 'Chinese Chinese Chinese Tokyo Japan ' * 1000
        self.classifier.exact = False
        self.classifier.prob_all(document.split())

    def test_classify(self):
        """``classify`` returns a (label, confidence) namedtuple."""
        document = 'Chinese Chinese Chinese Tokyo Japan'.split()
        label, confidence = ('yes', float(Fraction(4782969, 6934265)))
        prediction = self.classifier.classify(document)
        # Tuple check
        assert prediction == (label, confidence)
        # Namedtuple check
        assert prediction.label == label
        assert_almost_equal(prediction.confidence, confidence)

    def test_classify_not_tokenized(self):
        """``classify`` raises TypeError for an untokenized string document."""
        document = 'Chinese Chinese Chinese Tokyo Japan'
        assert_raises(TypeError, self.classifier.classify, document)

    def test_top_features(self):
        """With ``top_features = 2`` only two features per label are kept."""
        docs = [(['happy', 'joy', 'smile'], 'positive'),
                (['happy', 'joy', 'frown'], 'positive'),
                (['sad', 'frown', 'tired'], 'negative'),
                (['sad', 'tired', 'bored'], 'negative')]
        classifier = MultinomialNB()
        classifier.top_features = 2
        classifier.train(*docs)

        result = classifier._most_common['positive'].store
        assert result == {'happy': 2, 'joy': 2}
        result = classifier._most_common['negative'].store
        assert result == {'sad': 2, 'tired': 2}

        # Repeating 'smile' must not move any label's probability.
        first = classifier.prob_all(['happy', 'smile'])
        second = classifier.prob_all(['happy', 'smile', 'smile'])
        assert first == second, classifier._most_common

        # Appending 'frown' must not move the 'negative' probability.
        first = classifier.prob_all(['sad', 'tired'])['negative']
        second = classifier.prob_all(['sad', 'tired', 'frown'])['negative']
        assert first == second, classifier._most_common
Example #13
0
 def _setup(self):
     """Create the untrained classifier used by this experiment."""
     classifier = MultinomialNB()
     self.nb = classifier
Example #14
0
class TestMultinomialNB(object):
    """Tests for ``MultinomialNB`` built around a four-document corpus.

    ``setup`` trains ``self.classifier`` before every test and snapshots its
    internal counters; ``teardown`` asserts that the test did not change
    them.
    """

    # This test uses the examples provided by:
    # http://nlp.stanford.edu/IR-book/pdf/13bayes.pdf
    def setup(self):
        """Train a classifier on the example corpus and snapshot it."""
        self.training_docs = [('Chinese Bejing Chinese', 'yes'),
                              ('Chinese Chinese Shanghai', 'yes'),
                              ('Chinese Macao', 'yes'),
                              ('Tokyo Japan Chinese', 'no')]
        # Tokenize on whitespace so each doc becomes (tokens, label).
        self.training_docs = [(x.split(), y) for x, y in self.training_docs]
        self.classifier = MultinomialNB(*self.training_docs)
        self.make_snapshot()

    def teardown(self):
        """Check that the test left the shared classifier's counters alone."""
        self.assert_snapshot_identical()

    def make_snapshot(self):
        """Deep-copy the classifier's internal counters for later comparison."""
        self.orig_label_count = deepcopy(self.classifier._label_count)
        self.orig_label_vocab = deepcopy(self.classifier._label_vocab)
        self.orig_label_feature_count = deepcopy(self.classifier
                                                 ._label_feature_count)
        self.orig_label_length = deepcopy(self.classifier._label_length)

    def assert_snapshot_identical(self):
        """Call if classifier's internals shouldn't have changed."""
        assert self.orig_label_count == self.classifier._label_count
        assert self.orig_label_vocab == self.classifier._label_vocab
        assert (self.orig_label_feature_count ==
                self.classifier._label_feature_count)
        assert self.orig_label_length == self.classifier._label_length

    def test_init_no_training(self):
        """An untrained classifier starts empty and learns from ``train``."""
        classifier = MultinomialNB()
        assert classifier.vocabulary == set()
        assert classifier.labels == set()

        classifier.train(*self.training_docs)
        # Check the freshly trained instance itself.  Delegating to
        # self.test_labels() / self.test_vocabulary() only inspects
        # self.classifier, so it would pass even if train() did nothing here.
        expected_labels = set(label for _, label in self.training_docs)
        expected_vocabulary = set(
            token for tokens, _ in self.training_docs for token in tokens)
        assert classifier.labels == expected_labels
        assert classifier.vocabulary == expected_vocabulary

    def test_train_one_document(self):
        """One (tokens, label) pair given to the constructor fills the vocabulary."""
        documents = (['one', 'document', 'already', 'tokenized'], 'label')
        classifier = MultinomialNB(documents)
        expected = set(['one', 'document', 'already', 'tokenized'])
        assert classifier.vocabulary == expected

    def test_train_many_document(self):
        """Five copies of one document yield that document's vocabulary."""
        documents = [(['one', 'document', 'already', 'tokenized'], 'label')] * 5
        classifier = MultinomialNB(*documents)
        expected = set(['one', 'document', 'already', 'tokenized'])
        assert classifier.vocabulary == expected

    def test_train_not_tokenized(self):
        """Training on a plain string instead of a token list raises TypeError."""
        document = ('one document not tokenized', 'label')
        assert_raises(TypeError, self.classifier.train, document)

    def test_labels(self):
        """The shared classifier knows exactly the labels 'yes' and 'no'."""
        expected = set(['yes', 'no'])
        assert self.classifier.labels == expected

    def test_vocabulary(self):
        """The shared classifier's vocabulary is the six distinct corpus tokens."""
        expected = set(['Chinese', 'Bejing', 'Shanghai', 'Macao', 'Tokyo',
                        'Japan'])
        assert self.classifier.vocabulary == expected

    def test_vocab_size(self):
        """``_vocab_size`` equals the number of vocabulary entries."""
        actual = len(self.classifier.vocabulary)
        result = self.classifier._vocab_size
        assert actual == result

    def test_label_feature_count(self):
        """Per-label feature counts match the corpus; unseen pairs are absent."""
        tests = [('yes', 'Chinese', 5),
                 ('no', 'Chinese', 1),
                 ('no', 'Japan', 1)]
        for label, feature, count in tests:
            assert self.classifier._label_feature_count[label][feature] == count
        assert 'Japan' not in self.classifier._label_feature_count['yes']

    def test_prior(self):
        """``prior`` is checked with ``exact`` on (Fraction) and off (float)."""
        tests = [('yes', Fraction(3, 4)),
                 ('no', Fraction(1, 4))]
        for label, prob in tests:
            self.classifier.exact = True
            result = self.classifier.prior(label)
            assert result == prob

            self.classifier.exact = False
            result = self.classifier.prior(label)
            prob = float(prob)
            assert_almost_equal(result, prob)

    def test_prior_unseen_label(self):
        """Asking for the prior of an unknown label raises KeyError."""
        assert_raises(KeyError, self.classifier.prior, '__unseen__')

    def test_conditional(self):
        """``conditional`` matches the expected fractions, unseen features included."""
        tests = [('Chinese', 'yes', Fraction(6, 14)),
                 ('Japan', 'yes', Fraction(1, 14)),
                 ('Chinese', 'no', Fraction(2, 9)),
                 ('Tokyo', 'no', Fraction(2, 9)),
                 ('Japan', 'no', Fraction(2, 9)),
                 ('__invalid__', 'yes', Fraction(1, 14)),
                 ('__invalid__', 'no', Fraction(1, 9))]
        for feature, label, prob in tests:
            self.classifier.exact = True
            result = self.classifier.conditional(feature, label)
            assert result == prob

            self.classifier.exact = False
            result = self.classifier.conditional(feature, label)
            prob = float(prob)
            assert_almost_equal(result, prob)

    def test_conditional_laplace(self):
        """``conditional`` matches the expected fractions when ``laplace`` is 2."""
        self.classifier.laplace = 2
        tests = [('Chinese', 'yes', Fraction(7, 20)),
                 ('Japan', 'yes', Fraction(1, 10)),
                 ('Chinese', 'no', Fraction(1, 5)),
                 ('Tokyo', 'no', Fraction(1, 5)),
                 ('Japan', 'no', Fraction(1, 5)),
                 ('__invalid__', 'yes', Fraction(1, 10)),
                 ('__invalid__', 'no', Fraction(2, 15))]
        for feature, label, prob in tests:
            self.classifier.exact = True
            result = self.classifier.conditional(feature, label)
            assert result == prob

            self.classifier.exact = False
            result = self.classifier.conditional(feature, label)
            prob = float(prob)
            assert_almost_equal(result, prob)

    def test_conditional_unseen_feature(self):
        """Querying an unseen feature must not insert it into the counts."""
        self.classifier.conditional('__unseen__', 'yes')
        assert '__unseen__' not in self.classifier._label_feature_count['yes']

    def test_conditional_unseen_label(self):
        """An unseen label raises KeyError and is not inserted into the counts."""
        assert_raises(KeyError, self.classifier.conditional, '__unseen__',
                      '__unseen__')
        assert '__unseen__' not in self.classifier._label_feature_count

    def test_score(self):
        """``_score`` equals prior times conditionals for the test document."""
        tests = [('Chinese Chinese Chinese Tokyo Japan', 'yes',
                  Fraction(3, 4) * Fraction(3, 7) * Fraction(3, 7) *
                  Fraction(3, 7) * Fraction(1, 14) * Fraction(1, 14)),
                 ('Chinese Chinese Chinese Tokyo Japan', 'no',
                  Fraction(1, 4) * Fraction(2, 9) * Fraction(2, 9) *
                  Fraction(2, 9) * Fraction(2, 9) * Fraction(2, 9))]
        for document, label, score in tests:
            self.classifier.exact = True
            result = self.classifier._score(document.split(), label)
            assert result == score

            self.classifier.exact = False
            result = self.classifier._score(document.split(), label)
            # The non-exact score is exponentiated before being compared
            # with the product of probabilities.
            result = math.exp(result)
            score = float(score)
            assert_almost_equal(result, score)

    def test_score_not_tokenized(self):
        """``_score`` raises TypeError for an untokenized string document."""
        document, label = 'Chinese Chinese Chinese Tokyo Japan', 'yes'
        assert_raises(TypeError, self.classifier._score, document, label)

    def test_prob(self):
        """``prob`` returns the expected per-label value in both modes."""
        tests = [('Chinese Chinese Chinese Tokyo Japan', 'yes',
                  Fraction(4782969, 6934265)),
                 ('Chinese Chinese Chinese Tokyo Japan', 'no',
                  Fraction(2151296, 6934265))]
        for document, label, prob in tests:
            self.classifier.exact = True
            result = self.classifier.prob(document.split(), label)
            assert result == prob

            self.classifier.exact = False
            result = self.classifier.prob(document.split(), label)
            prob = float(prob)
            assert_almost_equal(result, prob)

    def test_prob_not_tokenized(self):
        """``prob`` raises TypeError for an untokenized string document."""
        document, label = 'Chinese Chinese Chinese Tokyo Japan', 'yes'
        assert_raises(TypeError, self.classifier.prob, document, label)

    def test_prob_all(self):
        """``prob_all`` maps each label to the same values ``test_prob`` expects."""
        document = 'Chinese Chinese Chinese Tokyo Japan'
        tests = [('yes', Fraction(4782969, 6934265)),
                 ('no', Fraction(2151296, 6934265))]
        for label, prob in tests:
            self.classifier.exact = True
            prob_all = self.classifier.prob_all(document.split())
            assert prob_all[label] == prob

            self.classifier.exact = False
            prob_all = self.classifier.prob_all(document.split())
            prob = float(prob)
            assert_almost_equal(prob_all[label], prob)

    def test_prob_all_not_tokenized(self):
        """``prob_all`` raises TypeError for an untokenized string document."""
        document = 'Chinese Chinese Chinese Tokyo Japan'
        assert_raises(TypeError, self.classifier.prob_all, document)

    def test_prob_all_near_zero(self):
        """``prob_all`` must not raise on a 5000-token document in float mode."""
        # Issue gh-14.
        document = 'Chinese Chinese Chinese Tokyo Japan ' * 1000
        self.classifier.exact = False
        self.classifier.prob_all(document.split())

    def test_classify(self):
        """``classify`` returns a (label, confidence) namedtuple."""
        document = 'Chinese Chinese Chinese Tokyo Japan'.split()
        label, confidence = ('yes', float(Fraction(4782969, 6934265)))
        prediction = self.classifier.classify(document)
        # Tuple check
        assert prediction == (label, confidence)
        # Namedtuple check
        assert prediction.label == label
        assert_almost_equal(prediction.confidence, confidence)

    def test_classify_not_tokenized(self):
        """``classify`` raises TypeError for an untokenized string document."""
        document = 'Chinese Chinese Chinese Tokyo Japan'
        assert_raises(TypeError, self.classifier.classify, document)

    def test_top_features(self):
        """With ``top_features = 2`` only two features per label are kept."""
        docs = [(['happy', 'joy', 'smile'], 'positive'),
                (['happy', 'joy', 'frown'], 'positive'),
                (['sad', 'frown', 'tired'], 'negative'),
                (['sad', 'tired', 'bored'], 'negative')]
        classifier = MultinomialNB()
        classifier.top_features = 2
        classifier.train(*docs)

        result = classifier._most_common['positive'].store
        assert result == {'happy': 2, 'joy': 2}
        result = classifier._most_common['negative'].store
        assert result == {'sad': 2, 'tired': 2}

        # Repeating 'smile' must not move any label's probability.
        first = classifier.prob_all(['happy', 'smile'])
        second = classifier.prob_all(['happy', 'smile', 'smile'])
        assert first == second, classifier._most_common

        # Appending 'frown' must not move the 'negative' probability.
        first = classifier.prob_all(['sad', 'tired'])['negative']
        second = classifier.prob_all(['sad', 'tired', 'frown'])['negative']
        assert first == second, classifier._most_common
Example #15
0
 def _setup(self):
     """Create the untrained subjectivity and polarity classifiers."""
     self.subjective = MultinomialNB()
     self.polarity = MultinomialNB()
Example #16
0
 def test_ngrams_multinomialnb(self):
     """Integration test: extractor output is accepted by ``MultinomialNB.train``."""
     nb = MultinomialNB()
     self.extractor.min_n = 1
     self.extractor.max_n = 3
     ngram_features = self.extractor.extract(self.document)
     # Passes as long as training on the extracted features does not raise.
     nb.train([ngram_features, 'positive'])
Example #17
0
 def test_ngrams_multinomialnb(self):
     """Integration test: extractor output is accepted by ``MultinomialNB.train``."""
     nb = MultinomialNB()
     self.extractor.min_n = 1
     self.extractor.max_n = 3
     ngram_features = self.extractor.extract(self.document)
     # Passes as long as training on the extracted features does not raise.
     nb.train([ngram_features, "positive"])