Beispiel #1
0
    def test_top_features(self):
        """Check per-label feature retention when ``top_features`` is 2.

        After training, each label's ``_most_common`` store is expected
        to hold only the two features that occurred twice for it. Tokens
        outside those stores ('smile' for 'positive', 'frown' for
        'negative') must not change the compared probabilities.
        """
        corpus = [
            (['happy', 'joy', 'smile'], 'positive'),
            (['happy', 'joy', 'frown'], 'positive'),
            (['sad', 'frown', 'tired'], 'negative'),
            (['sad', 'tired', 'bored'], 'negative'),
        ]
        nb = MultinomialNB()
        nb.top_features = 2
        nb.train(*corpus)

        # Expected retained counts, checked in the same label order.
        expected_stores = (
            ('positive', {'happy': 2, 'joy': 2}),
            ('negative', {'sad': 2, 'tired': 2}),
        )
        for label, counts in expected_stores:
            kept = nb._most_common[label].store
            assert kept == counts

        # An additional 'smile' token must leave all probabilities equal.
        baseline = nb.prob_all(['happy', 'smile'])
        repeated = nb.prob_all(['happy', 'smile', 'smile'])
        assert baseline == repeated, nb._most_common

        # Appending 'frown' must leave the 'negative' probability equal.
        baseline = nb.prob_all(['sad', 'tired'])['negative']
        extended = nb.prob_all(['sad', 'tired', 'frown'])['negative']
        assert baseline == extended, nb._most_common
Beispiel #2
0
    def test_top_features(self):
        """Check per-label feature retention when ``top_features`` is 2.

        After training, each label's ``_most_common`` store is expected
        to hold only the two features that occurred twice for it. Tokens
        outside those stores ('smile' for 'positive', 'frown' for
        'negative') must not change the compared probabilities.
        """
        corpus = [
            (['happy', 'joy', 'smile'], 'positive'),
            (['happy', 'joy', 'frown'], 'positive'),
            (['sad', 'frown', 'tired'], 'negative'),
            (['sad', 'tired', 'bored'], 'negative'),
        ]
        nb = MultinomialNB()
        nb.top_features = 2
        nb.train(*corpus)

        # Expected retained counts, checked in the same label order.
        expected_stores = (
            ('positive', {'happy': 2, 'joy': 2}),
            ('negative', {'sad': 2, 'tired': 2}),
        )
        for label, counts in expected_stores:
            kept = nb._most_common[label].store
            assert kept == counts

        # An additional 'smile' token must leave all probabilities equal.
        baseline = nb.prob_all(['happy', 'smile'])
        repeated = nb.prob_all(['happy', 'smile', 'smile'])
        assert baseline == repeated, nb._most_common

        # Appending 'frown' must leave the 'negative' probability equal.
        baseline = nb.prob_all(['sad', 'tired'])['negative']
        extended = nb.prob_all(['sad', 'tired', 'frown'])['negative']
        assert baseline == extended, nb._most_common
Beispiel #3
0
class TestMultinomialNB(object):
    """Tests for ``MultinomialNB`` on a four-document toy corpus.

    Every test gets a freshly trained ``self.classifier`` from ``setup``;
    ``teardown`` then asserts that the classifier's internal counters are
    unchanged, so no test may mutate the trained state of the fixture.
    """

    # This test uses the examples provided by:
    # http://nlp.stanford.edu/IR-book/pdf/13bayes.pdf
    def setup(self):
        """Train the fixture classifier and snapshot its internal counts."""
        self.training_docs = [('Chinese Bejing Chinese', 'yes'),
                              ('Chinese Chinese Shanghai', 'yes'),
                              ('Chinese Macao', 'yes'),
                              ('Tokyo Japan Chinese', 'no')]
        # Documents must be tokenized before training.
        self.training_docs = [(x.split(), y) for x, y in self.training_docs]
        self.classifier = MultinomialNB(*self.training_docs)
        self.make_snapshot()

    def teardown(self):
        """Verify that the test left the fixture's counts untouched."""
        self.assert_snapshot_identical()

    def make_snapshot(self):
        """Deep-copy the classifier's internal counters for later comparison."""
        self.orig_label_count = deepcopy(self.classifier._label_count)
        self.orig_label_vocab = deepcopy(self.classifier._label_vocab)
        self.orig_label_feature_count = deepcopy(
            self.classifier._label_feature_count)
        self.orig_label_length = deepcopy(self.classifier._label_length)

    def assert_snapshot_identical(self):
        """Call if classifier's internals shouldn't have changed."""
        assert self.orig_label_count == self.classifier._label_count
        assert self.orig_label_vocab == self.classifier._label_vocab
        assert (self.orig_label_feature_count ==
                self.classifier._label_feature_count)
        assert self.orig_label_length == self.classifier._label_length

    def test_init_no_training(self):
        """An untrained classifier starts empty and can be trained later."""
        classifier = MultinomialNB()
        assert classifier.vocabulary == set()
        assert classifier.labels == set()
        classifier.train(*self.training_docs)
        # Inspect the classifier trained here, not the fixture: the fixture
        # was already trained in setup() and would pass regardless of
        # whether train() worked on the empty classifier.
        assert classifier.labels == set(['yes', 'no'])
        assert classifier.vocabulary == set(
            ['Chinese', 'Bejing', 'Shanghai', 'Macao', 'Tokyo', 'Japan'])

    def test_train_one_document(self):
        """A single (tokens, label) pair is accepted by the constructor."""
        documents = (['one', 'document', 'already', 'tokenized'], 'label')
        classifier = MultinomialNB(documents)
        expected = set(['one', 'document', 'already', 'tokenized'])
        assert classifier.vocabulary == expected

    def test_train_many_document(self):
        """Repeated identical documents do not enlarge the vocabulary."""
        documents = [(['one', 'document', 'already', 'tokenized'], 'label')
                     ] * 5
        classifier = MultinomialNB(*documents)
        expected = set(['one', 'document', 'already', 'tokenized'])
        assert classifier.vocabulary == expected

    def test_train_not_tokenized(self):
        """Training on a plain string instead of tokens raises TypeError."""
        document = ('one document not tokenized', 'label')
        assert_raises(TypeError, self.classifier.train, document)

    def test_labels(self):
        """The fixture classifier knows exactly the two training labels."""
        expected = set(['yes', 'no'])
        assert self.classifier.labels == expected

    def test_vocabulary(self):
        """The vocabulary is the set of distinct training tokens."""
        expected = set(
            ['Chinese', 'Bejing', 'Shanghai', 'Macao', 'Tokyo', 'Japan'])
        assert self.classifier.vocabulary == expected

    def test_vocab_size(self):
        """``_vocab_size`` agrees with the length of ``vocabulary``."""
        actual = len(self.classifier.vocabulary)
        result = self.classifier._vocab_size
        assert actual == result

    def test_label_feature_count(self):
        """Per-label feature counts match the corpus; absent ones are missing."""
        tests = [('yes', 'Chinese', 5), ('no', 'Chinese', 1),
                 ('no', 'Japan', 1)]
        for label, feature, count in tests:
            assert self.classifier._label_feature_count[label][
                feature] == count
        assert 'Japan' not in self.classifier._label_feature_count['yes']

    def test_prior(self):
        """Priors are 3/4 and 1/4, in both exact and float mode."""
        tests = [('yes', Fraction(3, 4)), ('no', Fraction(1, 4))]
        for label, prob in tests:
            # Exact mode is compared against the Fraction directly.
            self.classifier.exact = True
            result = self.classifier.prior(label)
            assert result == prob

            # Float mode is compared approximately.
            self.classifier.exact = False
            result = self.classifier.prior(label)
            prob = float(prob)
            assert_almost_equal(result, prob)

    def test_prior_unseen_label(self):
        """Asking for the prior of an unknown label raises KeyError."""
        assert_raises(KeyError, self.classifier.prior, '__unseen__')

    def test_conditional(self):
        """Conditional probabilities match the expected smoothed values.

        The 'yes' label has 8 training tokens and 'no' has 3, with a
        vocabulary of 6; e.g. P(Chinese | yes) = (5 + 1) / (8 + 6).
        """
        tests = [('Chinese', 'yes', Fraction(6, 14)),
                 ('Japan', 'yes', Fraction(1, 14)),
                 ('Chinese', 'no', Fraction(2, 9)),
                 ('Tokyo', 'no', Fraction(2, 9)),
                 ('Japan', 'no', Fraction(2, 9)),
                 ('__invalid__', 'yes', Fraction(1, 14)),
                 ('__invalid__', 'no', Fraction(1, 9))]
        for feature, label, prob in tests:
            self.classifier.exact = True
            result = self.classifier.conditional(feature, label)
            assert result == prob

            self.classifier.exact = False
            result = self.classifier.conditional(feature, label)
            prob = float(prob)
            assert_almost_equal(result, prob)

    def test_conditional_laplace(self):
        """Conditionals with ``laplace = 2``, e.g. (5 + 2) / (8 + 2 * 6)."""
        self.classifier.laplace = 2
        tests = [('Chinese', 'yes', Fraction(7, 20)),
                 ('Japan', 'yes', Fraction(1, 10)),
                 ('Chinese', 'no', Fraction(1, 5)),
                 ('Tokyo', 'no', Fraction(1, 5)),
                 ('Japan', 'no', Fraction(1, 5)),
                 ('__invalid__', 'yes', Fraction(1, 10)),
                 ('__invalid__', 'no', Fraction(2, 15))]
        for feature, label, prob in tests:
            self.classifier.exact = True
            result = self.classifier.conditional(feature, label)
            assert result == prob

            self.classifier.exact = False
            result = self.classifier.conditional(feature, label)
            prob = float(prob)
            assert_almost_equal(result, prob)

    def test_conditional_unseen_feature(self):
        """Querying an unseen feature must not insert it into the counts."""
        self.classifier.conditional('__unseen__', 'yes')
        assert '__unseen__' not in self.classifier._label_feature_count['yes']

    def test_conditional_unseen_label(self):
        """An unseen label raises KeyError and is not added to the counts."""
        assert_raises(KeyError, self.classifier.conditional, '__unseen__',
                      '__unseen__')
        assert '__unseen__' not in self.classifier._label_feature_count

    def test_score(self):
        """``_score`` is prior times the per-token conditionals.

        In float mode the result is passed through ``math.exp`` before the
        comparison, i.e. the test treats it as a logarithm.
        """
        tests = [('Chinese Chinese Chinese Tokyo Japan', 'yes',
                  Fraction(3, 4) * Fraction(3, 7) * Fraction(3, 7) *
                  Fraction(3, 7) * Fraction(1, 14) * Fraction(1, 14)),
                 ('Chinese Chinese Chinese Tokyo Japan', 'no',
                  Fraction(1, 4) * Fraction(2, 9) * Fraction(2, 9) *
                  Fraction(2, 9) * Fraction(2, 9) * Fraction(2, 9))]
        for document, label, score in tests:
            self.classifier.exact = True
            result = self.classifier._score(document.split(), label)
            assert result == score

            self.classifier.exact = False
            result = self.classifier._score(document.split(), label)
            result = math.exp(result)
            score = float(score)
            assert_almost_equal(result, score)

    def test_score_not_tokenized(self):
        """Scoring a plain string instead of tokens raises TypeError."""
        document, label = 'Chinese Chinese Chinese Tokyo Japan', 'yes'
        assert_raises(TypeError, self.classifier._score, document, label)

    def test_prob(self):
        """``prob`` returns the expected per-label probability."""
        tests = [('Chinese Chinese Chinese Tokyo Japan', 'yes',
                  Fraction(4782969, 6934265)),
                 ('Chinese Chinese Chinese Tokyo Japan', 'no',
                  Fraction(2151296, 6934265))]
        for document, label, prob in tests:
            self.classifier.exact = True
            result = self.classifier.prob(document.split(), label)
            assert result == prob

            self.classifier.exact = False
            result = self.classifier.prob(document.split(), label)
            prob = float(prob)
            assert_almost_equal(result, prob)

    def test_prob_not_tokenized(self):
        """``prob`` on a plain string instead of tokens raises TypeError."""
        document, label = 'Chinese Chinese Chinese Tokyo Japan', 'yes'
        assert_raises(TypeError, self.classifier.prob, document, label)

    def test_prob_all(self):
        """``prob_all`` maps each label to the value ``prob`` is tested for."""
        document = 'Chinese Chinese Chinese Tokyo Japan'
        tests = [('yes', Fraction(4782969, 6934265)),
                 ('no', Fraction(2151296, 6934265))]
        for label, prob in tests:
            self.classifier.exact = True
            prob_all = self.classifier.prob_all(document.split())
            assert prob_all[label] == prob

            self.classifier.exact = False
            prob_all = self.classifier.prob_all(document.split())
            prob = float(prob)
            assert_almost_equal(prob_all[label], prob)

    def test_prob_all_not_tokenized(self):
        """``prob_all`` on a plain string raises TypeError."""
        document = 'Chinese Chinese Chinese Tokyo Japan'
        assert_raises(TypeError, self.classifier.prob_all, document)

    def test_prob_all_near_zero(self):
        """A 5000-token document must not make float-mode ``prob_all`` raise."""
        # Issue gh-14.
        document = 'Chinese Chinese Chinese Tokyo Japan ' * 1000
        self.classifier.exact = False
        # Only completion is checked; the returned values are not inspected.
        self.classifier.prob_all(document.split())

    def test_classify(self):
        """``classify`` returns a (label, confidence) namedtuple."""
        document = 'Chinese Chinese Chinese Tokyo Japan'.split()
        label, confidence = ('yes', float(Fraction(4782969, 6934265)))
        prediction = self.classifier.classify(document)
        # Tuple check: the prediction must unpack into exactly two items.
        # The expected confidence is a float, so it is compared
        # approximately rather than with ==, matching the check below.
        predicted_label, predicted_confidence = prediction
        assert predicted_label == label
        assert_almost_equal(predicted_confidence, confidence)
        # Namedtuple check
        assert prediction.label == label
        assert_almost_equal(prediction.confidence, confidence)

    def test_classify_not_tokenized(self):
        """``classify`` on a plain string raises TypeError."""
        document = 'Chinese Chinese Chinese Tokyo Japan'
        assert_raises(TypeError, self.classifier.classify, document)

    def test_top_features(self):
        """With ``top_features = 2`` only two features per label are kept.

        Tokens outside the kept features ('smile' for 'positive', 'frown'
        for 'negative') must not change the compared probabilities.
        """
        docs = [(['happy', 'joy', 'smile'], 'positive'),
                (['happy', 'joy', 'frown'], 'positive'),
                (['sad', 'frown', 'tired'], 'negative'),
                (['sad', 'tired', 'bored'], 'negative')]
        # A local classifier is used so the fixture snapshot stays valid.
        classifier = MultinomialNB()
        classifier.top_features = 2
        classifier.train(*docs)

        result = classifier._most_common['positive'].store
        assert result == {'happy': 2, 'joy': 2}
        result = classifier._most_common['negative'].store
        assert result == {'sad': 2, 'tired': 2}

        first = classifier.prob_all(['happy', 'smile'])
        second = classifier.prob_all(['happy', 'smile', 'smile'])
        assert first == second, classifier._most_common

        first = classifier.prob_all(['sad', 'tired'])['negative']
        second = classifier.prob_all(['sad', 'tired', 'frown'])['negative']
        assert first == second, classifier._most_common
Beispiel #4
0
class TestMultinomialNB(object):
    """Tests for ``MultinomialNB`` on a four-document toy corpus.

    Every test gets a freshly trained ``self.classifier`` from ``setup``;
    ``teardown`` then asserts that the classifier's internal counters are
    unchanged, so no test may mutate the trained state of the fixture.
    """

    # This test uses the examples provided by:
    # http://nlp.stanford.edu/IR-book/pdf/13bayes.pdf
    def setup(self):
        """Train the fixture classifier and snapshot its internal counts."""
        self.training_docs = [('Chinese Bejing Chinese', 'yes'),
                              ('Chinese Chinese Shanghai', 'yes'),
                              ('Chinese Macao', 'yes'),
                              ('Tokyo Japan Chinese', 'no')]
        # Documents must be tokenized before training.
        self.training_docs = [(x.split(), y) for x, y in self.training_docs]
        self.classifier = MultinomialNB(*self.training_docs)
        self.make_snapshot()

    def teardown(self):
        """Verify that the test left the fixture's counts untouched."""
        self.assert_snapshot_identical()

    def make_snapshot(self):
        """Deep-copy the classifier's internal counters for later comparison."""
        self.orig_label_count = deepcopy(self.classifier._label_count)
        self.orig_label_vocab = deepcopy(self.classifier._label_vocab)
        self.orig_label_feature_count = deepcopy(self.classifier
                                                 ._label_feature_count)
        self.orig_label_length = deepcopy(self.classifier._label_length)

    def assert_snapshot_identical(self):
        """Call if classifier's internals shouldn't have changed."""
        assert self.orig_label_count == self.classifier._label_count
        assert self.orig_label_vocab == self.classifier._label_vocab
        assert (self.orig_label_feature_count ==
                self.classifier._label_feature_count)
        assert self.orig_label_length == self.classifier._label_length

    def test_init_no_training(self):
        """An untrained classifier starts empty and can be trained later."""
        classifier = MultinomialNB()
        assert classifier.vocabulary == set()
        assert classifier.labels == set()
        classifier.train(*self.training_docs)
        # Inspect the classifier trained here, not the fixture: the fixture
        # was already trained in setup() and would pass regardless of
        # whether train() worked on the empty classifier.
        assert classifier.labels == set(['yes', 'no'])
        assert classifier.vocabulary == set(['Chinese', 'Bejing', 'Shanghai',
                                             'Macao', 'Tokyo', 'Japan'])

    def test_train_one_document(self):
        """A single (tokens, label) pair is accepted by the constructor."""
        documents = (['one', 'document', 'already', 'tokenized'], 'label')
        classifier = MultinomialNB(documents)
        expected = set(['one', 'document', 'already', 'tokenized'])
        assert classifier.vocabulary == expected

    def test_train_many_document(self):
        """Repeated identical documents do not enlarge the vocabulary."""
        documents = [(['one', 'document', 'already', 'tokenized'], 'label')] * 5
        classifier = MultinomialNB(*documents)
        expected = set(['one', 'document', 'already', 'tokenized'])
        assert classifier.vocabulary == expected

    def test_train_not_tokenized(self):
        """Training on a plain string instead of tokens raises TypeError."""
        document = ('one document not tokenized', 'label')
        assert_raises(TypeError, self.classifier.train, document)

    def test_labels(self):
        """The fixture classifier knows exactly the two training labels."""
        expected = set(['yes', 'no'])
        assert self.classifier.labels == expected

    def test_vocabulary(self):
        """The vocabulary is the set of distinct training tokens."""
        expected = set(['Chinese', 'Bejing', 'Shanghai', 'Macao', 'Tokyo',
                        'Japan'])
        assert self.classifier.vocabulary == expected

    def test_vocab_size(self):
        """``_vocab_size`` agrees with the length of ``vocabulary``."""
        actual = len(self.classifier.vocabulary)
        result = self.classifier._vocab_size
        assert actual == result

    def test_label_feature_count(self):
        """Per-label feature counts match the corpus; absent ones are missing."""
        tests = [('yes', 'Chinese', 5),
                 ('no', 'Chinese', 1),
                 ('no', 'Japan', 1)]
        for label, feature, count in tests:
            assert self.classifier._label_feature_count[label][feature] == count
        assert 'Japan' not in self.classifier._label_feature_count['yes']

    def test_prior(self):
        """Priors are 3/4 and 1/4, in both exact and float mode."""
        tests = [('yes', Fraction(3, 4)),
                 ('no', Fraction(1, 4))]
        for label, prob in tests:
            # Exact mode is compared against the Fraction directly.
            self.classifier.exact = True
            result = self.classifier.prior(label)
            assert result == prob

            # Float mode is compared approximately.
            self.classifier.exact = False
            result = self.classifier.prior(label)
            prob = float(prob)
            assert_almost_equal(result, prob)

    def test_prior_unseen_label(self):
        """Asking for the prior of an unknown label raises KeyError."""
        assert_raises(KeyError, self.classifier.prior, '__unseen__')

    def test_conditional(self):
        """Conditional probabilities match the expected smoothed values.

        The 'yes' label has 8 training tokens and 'no' has 3, with a
        vocabulary of 6; e.g. P(Chinese | yes) = (5 + 1) / (8 + 6).
        """
        tests = [('Chinese', 'yes', Fraction(6, 14)),
                 ('Japan', 'yes', Fraction(1, 14)),
                 ('Chinese', 'no', Fraction(2, 9)),
                 ('Tokyo', 'no', Fraction(2, 9)),
                 ('Japan', 'no', Fraction(2, 9)),
                 ('__invalid__', 'yes', Fraction(1, 14)),
                 ('__invalid__', 'no', Fraction(1, 9))]
        for feature, label, prob in tests:
            self.classifier.exact = True
            result = self.classifier.conditional(feature, label)
            assert result == prob

            self.classifier.exact = False
            result = self.classifier.conditional(feature, label)
            prob = float(prob)
            assert_almost_equal(result, prob)

    def test_conditional_laplace(self):
        """Conditionals with ``laplace = 2``, e.g. (5 + 2) / (8 + 2 * 6)."""
        self.classifier.laplace = 2
        tests = [('Chinese', 'yes', Fraction(7, 20)),
                 ('Japan', 'yes', Fraction(1, 10)),
                 ('Chinese', 'no', Fraction(1, 5)),
                 ('Tokyo', 'no', Fraction(1, 5)),
                 ('Japan', 'no', Fraction(1, 5)),
                 ('__invalid__', 'yes', Fraction(1, 10)),
                 ('__invalid__', 'no', Fraction(2, 15))]
        for feature, label, prob in tests:
            self.classifier.exact = True
            result = self.classifier.conditional(feature, label)
            assert result == prob

            self.classifier.exact = False
            result = self.classifier.conditional(feature, label)
            prob = float(prob)
            assert_almost_equal(result, prob)

    def test_conditional_unseen_feature(self):
        """Querying an unseen feature must not insert it into the counts."""
        self.classifier.conditional('__unseen__', 'yes')
        assert '__unseen__' not in self.classifier._label_feature_count['yes']

    def test_conditional_unseen_label(self):
        """An unseen label raises KeyError and is not added to the counts."""
        assert_raises(KeyError, self.classifier.conditional, '__unseen__',
                      '__unseen__')
        assert '__unseen__' not in self.classifier._label_feature_count

    def test_score(self):
        """``_score`` is prior times the per-token conditionals.

        In float mode the result is passed through ``math.exp`` before the
        comparison, i.e. the test treats it as a logarithm.
        """
        tests = [('Chinese Chinese Chinese Tokyo Japan', 'yes',
                  Fraction(3, 4) * Fraction(3, 7) * Fraction(3, 7) *
                  Fraction(3, 7) * Fraction(1, 14) * Fraction(1, 14)),
                 ('Chinese Chinese Chinese Tokyo Japan', 'no',
                  Fraction(1, 4) * Fraction(2, 9) * Fraction(2, 9) *
                  Fraction(2, 9) * Fraction(2, 9) * Fraction(2, 9))]
        for document, label, score in tests:
            self.classifier.exact = True
            result = self.classifier._score(document.split(), label)
            assert result == score

            self.classifier.exact = False
            result = self.classifier._score(document.split(), label)
            result = math.exp(result)
            score = float(score)
            assert_almost_equal(result, score)

    def test_score_not_tokenized(self):
        """Scoring a plain string instead of tokens raises TypeError."""
        document, label = 'Chinese Chinese Chinese Tokyo Japan', 'yes'
        assert_raises(TypeError, self.classifier._score, document, label)

    def test_prob(self):
        """``prob`` returns the expected per-label probability."""
        tests = [('Chinese Chinese Chinese Tokyo Japan', 'yes',
                  Fraction(4782969, 6934265)),
                 ('Chinese Chinese Chinese Tokyo Japan', 'no',
                  Fraction(2151296, 6934265))]
        for document, label, prob in tests:
            self.classifier.exact = True
            result = self.classifier.prob(document.split(), label)
            assert result == prob

            self.classifier.exact = False
            result = self.classifier.prob(document.split(), label)
            prob = float(prob)
            assert_almost_equal(result, prob)

    def test_prob_not_tokenized(self):
        """``prob`` on a plain string instead of tokens raises TypeError."""
        document, label = 'Chinese Chinese Chinese Tokyo Japan', 'yes'
        assert_raises(TypeError, self.classifier.prob, document, label)

    def test_prob_all(self):
        """``prob_all`` maps each label to the value ``prob`` is tested for."""
        document = 'Chinese Chinese Chinese Tokyo Japan'
        tests = [('yes', Fraction(4782969, 6934265)),
                 ('no', Fraction(2151296, 6934265))]
        for label, prob in tests:
            self.classifier.exact = True
            prob_all = self.classifier.prob_all(document.split())
            assert prob_all[label] == prob

            self.classifier.exact = False
            prob_all = self.classifier.prob_all(document.split())
            prob = float(prob)
            assert_almost_equal(prob_all[label], prob)

    def test_prob_all_not_tokenized(self):
        """``prob_all`` on a plain string raises TypeError."""
        document = 'Chinese Chinese Chinese Tokyo Japan'
        assert_raises(TypeError, self.classifier.prob_all, document)

    def test_prob_all_near_zero(self):
        """A 5000-token document must not make float-mode ``prob_all`` raise."""
        # Issue gh-14.
        document = 'Chinese Chinese Chinese Tokyo Japan ' * 1000
        self.classifier.exact = False
        # Only completion is checked; the returned values are not inspected.
        self.classifier.prob_all(document.split())

    def test_classify(self):
        """``classify`` returns a (label, confidence) namedtuple."""
        document = 'Chinese Chinese Chinese Tokyo Japan'.split()
        label, confidence = ('yes', float(Fraction(4782969, 6934265)))
        prediction = self.classifier.classify(document)
        # Tuple check: the prediction must unpack into exactly two items.
        # The expected confidence is a float, so it is compared
        # approximately rather than with ==, matching the check below.
        predicted_label, predicted_confidence = prediction
        assert predicted_label == label
        assert_almost_equal(predicted_confidence, confidence)
        # Namedtuple check
        assert prediction.label == label
        assert_almost_equal(prediction.confidence, confidence)

    def test_classify_not_tokenized(self):
        """``classify`` on a plain string raises TypeError."""
        document = 'Chinese Chinese Chinese Tokyo Japan'
        assert_raises(TypeError, self.classifier.classify, document)

    def test_top_features(self):
        """With ``top_features = 2`` only two features per label are kept.

        Tokens outside the kept features ('smile' for 'positive', 'frown'
        for 'negative') must not change the compared probabilities.
        """
        docs = [(['happy', 'joy', 'smile'], 'positive'),
                (['happy', 'joy', 'frown'], 'positive'),
                (['sad', 'frown', 'tired'], 'negative'),
                (['sad', 'tired', 'bored'], 'negative')]
        # A local classifier is used so the fixture snapshot stays valid.
        classifier = MultinomialNB()
        classifier.top_features = 2
        classifier.train(*docs)

        result = classifier._most_common['positive'].store
        assert result == {'happy': 2, 'joy': 2}
        result = classifier._most_common['negative'].store
        assert result == {'sad': 2, 'tired': 2}

        first = classifier.prob_all(['happy', 'smile'])
        second = classifier.prob_all(['happy', 'smile', 'smile'])
        assert first == second, classifier._most_common

        first = classifier.prob_all(['sad', 'tired'])['negative']
        second = classifier.prob_all(['sad', 'tired', 'frown'])['negative']
        assert first == second, classifier._most_common