def test_top_features(self):
    """With top_features set, only the N most frequent features per label
    are kept, so occurrences of discarded features must not change the
    computed probabilities.
    """
    # NOTE(review): this def sits outside any class (despite taking
    # ``self``) and mirrors TestMultinomialNB.test_top_features below --
    # looks like a paste artifact; confirm and remove one copy.
    corpus = [(['happy', 'joy', 'smile'], 'positive'),
              (['happy', 'joy', 'frown'], 'positive'),
              (['sad', 'frown', 'tired'], 'negative'),
              (['sad', 'tired', 'bored'], 'negative')]
    clf = MultinomialNB()
    clf.top_features = 2
    clf.train(*corpus)
    # Only the two most common features per label survive training.
    assert clf._most_common['positive'].store == {'happy': 2, 'joy': 2}
    assert clf._most_common['negative'].store == {'sad': 2, 'tired': 2}
    # 'smile' was not retained for 'positive', so repeating it is a no-op.
    base = clf.prob_all(['happy', 'smile'])
    padded = clf.prob_all(['happy', 'smile', 'smile'])
    assert base == padded, clf._most_common
    # 'frown' was not retained for 'negative', likewise.
    base = clf.prob_all(['sad', 'tired'])['negative']
    padded = clf.prob_all(['sad', 'tired', 'frown'])['negative']
    assert base == padded, clf._most_common
class TestMultinomialNB(object):
    """Exercise MultinomialNB end to end on a small worked example."""
    # This test uses the examples provided by:
    # http://nlp.stanford.edu/IR-book/pdf/13bayes.pdf

    def setup(self):
        """Build a fresh classifier from the IR-book toy corpus."""
        self.training_docs = [('Chinese Bejing Chinese', 'yes'),
                              ('Chinese Chinese Shanghai', 'yes'),
                              ('Chinese Macao', 'yes'),
                              ('Tokyo Japan Chinese', 'no')]
        # Training documents must be pre-tokenized lists of features.
        self.training_docs = [(x.split(), y) for x, y in self.training_docs]
        self.classifier = MultinomialNB(*self.training_docs)
        self.make_snapshot()

    def teardown(self):
        # Each test is expected to leave the classifier's internals intact.
        self.assert_snapshot_identical()

    def make_snapshot(self):
        """Deep-copy the classifier's internal state for later comparison."""
        self.orig_label_count = deepcopy(self.classifier._label_count)
        self.orig_label_vocab = deepcopy(self.classifier._label_vocab)
        self.orig_label_feature_count = deepcopy(
            self.classifier._label_feature_count)
        self.orig_label_length = deepcopy(self.classifier._label_length)

    def assert_snapshot_identical(self):
        """Call if classifier's internals shouldn't have changed."""
        assert self.orig_label_count == self.classifier._label_count
        assert self.orig_label_vocab == self.classifier._label_vocab
        assert (self.orig_label_feature_count ==
                self.classifier._label_feature_count)
        assert self.orig_label_length == self.classifier._label_length

    def test_init_no_training(self):
        """An untrained classifier starts empty; train() populates it."""
        classifier = MultinomialNB()
        assert classifier.vocabulary == set()
        assert classifier.labels == set()
        classifier.train(*self.training_docs)
        # NOTE(review): these helpers re-check self.classifier (built in
        # setup), not the local ``classifier`` trained above -- confirm
        # that is the intent.
        self.test_labels()
        self.test_vocabulary()

    def test_train_one_document(self):
        documents = (['one', 'document', 'already', 'tokenized'], 'label')
        classifier = MultinomialNB(documents)
        expected = set(['one', 'document', 'already', 'tokenized'])
        assert classifier.vocabulary == expected

    def test_train_many_document(self):
        # Duplicated documents must not duplicate vocabulary entries.
        documents = [(['one', 'document', 'already', 'tokenized'],
                      'label')] * 5
        classifier = MultinomialNB(*documents)
        expected = set(['one', 'document', 'already', 'tokenized'])
        assert classifier.vocabulary == expected

    def test_train_not_tokenized(self):
        # A raw string instead of a token list must raise TypeError.
        document = ('one document not tokenized', 'label')
        assert_raises(TypeError, self.classifier.train, document)

    def test_labels(self):
        expected = set(['yes', 'no'])
        assert self.classifier.labels == expected

    def test_vocabulary(self):
        expected = set(
            ['Chinese', 'Bejing', 'Shanghai', 'Macao', 'Tokyo', 'Japan'])
        assert self.classifier.vocabulary == expected

    def test_vocab_size(self):
        actual = len(self.classifier.vocabulary)
        result = self.classifier._vocab_size
        assert actual == result

    def test_label_feature_count(self):
        tests = [('yes', 'Chinese', 5),
                 ('no', 'Chinese', 1),
                 ('no', 'Japan', 1)]
        for label, feature, count in tests:
            assert self.classifier._label_feature_count[label][
                feature] == count
        # Features seen only under one label must not leak into another.
        assert 'Japan' not in self.classifier._label_feature_count['yes']

    def test_prior(self):
        tests = [('yes', Fraction(3, 4)), ('no', Fraction(1, 4))]
        for label, prob in tests:
            # exact=True yields Fractions; exact=False yields floats.
            self.classifier.exact = True
            result = self.classifier.prior(label)
            assert result == prob
            self.classifier.exact = False
            result = self.classifier.prior(label)
            prob = float(prob)
            assert_almost_equal(result, prob)

    def test_prior_unseen_label(self):
        assert_raises(KeyError, self.classifier.prior, '__unseen__')

    def test_conditional(self):
        # Smoothed conditionals, including unseen-feature fallbacks.
        tests = [('Chinese', 'yes', Fraction(6, 14)),
                 ('Japan', 'yes', Fraction(1, 14)),
                 ('Chinese', 'no', Fraction(2, 9)),
                 ('Tokyo', 'no', Fraction(2, 9)),
                 ('Japan', 'no', Fraction(2, 9)),
                 ('__invalid__', 'yes', Fraction(1, 14)),
                 ('__invalid__', 'no', Fraction(1, 9))]
        for feature, label, prob in tests:
            self.classifier.exact = True
            result = self.classifier.conditional(feature, label)
            assert result == prob
            self.classifier.exact = False
            result = self.classifier.conditional(feature, label)
            prob = float(prob)
            assert_almost_equal(result, prob)

    def test_conditional_laplace(self):
        # Smoothing constant of 2 instead of the default add-one.
        self.classifier.laplace = 2
        tests = [('Chinese', 'yes', Fraction(7, 20)),
                 ('Japan', 'yes', Fraction(1, 10)),
                 ('Chinese', 'no', Fraction(1, 5)),
                 ('Tokyo', 'no', Fraction(1, 5)),
                 ('Japan', 'no', Fraction(1, 5)),
                 ('__invalid__', 'yes', Fraction(1, 10)),
                 ('__invalid__', 'no', Fraction(2, 15))]
        for feature, label, prob in tests:
            self.classifier.exact = True
            result = self.classifier.conditional(feature, label)
            assert result == prob
            self.classifier.exact = False
            result = self.classifier.conditional(feature, label)
            prob = float(prob)
            assert_almost_equal(result, prob)

    def test_conditional_unseen_feature(self):
        # Querying an unseen feature must not add it to the counts.
        self.classifier.conditional('__unseen__', 'yes')
        assert '__unseen__' not in self.classifier._label_feature_count['yes']

    def test_conditional_unseen_label(self):
        assert_raises(KeyError, self.classifier.conditional,
                      '__unseen__', '__unseen__')
        assert '__unseen__' not in self.classifier._label_feature_count

    def test_score(self):
        # Unnormalized score: prior times each token's conditional.
        tests = [('Chinese Chinese Chinese Tokyo Japan', 'yes',
                  Fraction(3, 4) * Fraction(3, 7) * Fraction(3, 7) *
                  Fraction(3, 7) * Fraction(1, 14) * Fraction(1, 14)),
                 ('Chinese Chinese Chinese Tokyo Japan', 'no',
                  Fraction(1, 4) * Fraction(2, 9) * Fraction(2, 9) *
                  Fraction(2, 9) * Fraction(2, 9) * Fraction(2, 9))]
        for document, label, score in tests:
            self.classifier.exact = True
            result = self.classifier._score(document.split(), label)
            assert result == score
            self.classifier.exact = False
            # Inexact scores are log-probabilities; exp() to compare.
            result = self.classifier._score(document.split(), label)
            result = math.exp(result)
            score = float(score)
            assert_almost_equal(result, score)

    def test_score_not_tokenized(self):
        document, label = 'Chinese Chinese Chinese Tokyo Japan', 'yes'
        assert_raises(TypeError, self.classifier._score, document, label)

    def test_prob(self):
        # Normalized posterior probabilities for each label.
        tests = [('Chinese Chinese Chinese Tokyo Japan', 'yes',
                  Fraction(4782969, 6934265)),
                 ('Chinese Chinese Chinese Tokyo Japan', 'no',
                  Fraction(2151296, 6934265))]
        for document, label, prob in tests:
            self.classifier.exact = True
            result = self.classifier.prob(document.split(), label)
            assert result == prob
            self.classifier.exact = False
            result = self.classifier.prob(document.split(), label)
            prob = float(prob)
            assert_almost_equal(result, prob)

    def test_prob_not_tokenized(self):
        document, label = 'Chinese Chinese Chinese Tokyo Japan', 'yes'
        assert_raises(TypeError,
                      self.classifier.prob, document, label)

    def test_prob_all(self):
        document = 'Chinese Chinese Chinese Tokyo Japan'
        tests = [('yes', Fraction(4782969, 6934265)),
                 ('no', Fraction(2151296, 6934265))]
        for label, prob in tests:
            self.classifier.exact = True
            prob_all = self.classifier.prob_all(document.split())
            assert prob_all[label] == prob
            self.classifier.exact = False
            prob_all = self.classifier.prob_all(document.split())
            prob = float(prob)
            assert_almost_equal(prob_all[label], prob)

    def test_prob_all_not_tokenized(self):
        document = 'Chinese Chinese Chinese Tokyo Japan'
        assert_raises(TypeError, self.classifier.prob_all, document)

    def test_prob_all_near_zero(self):
        # Issue gh-14.
        # Very long documents drive raw probabilities toward zero; this
        # must not raise (e.g. from underflow / log-of-zero).
        document = 'Chinese Chinese Chinese Tokyo Japan ' * 1000
        self.classifier.exact = False
        self.classifier.prob_all(document.split())

    def test_classify(self):
        document = 'Chinese Chinese Chinese Tokyo Japan'.split()
        label, confidence = ('yes', float(Fraction(4782969, 6934265)))
        prediction = self.classifier.classify(document)
        # Tuple check
        assert prediction == (label, confidence)
        # Namedtuple check
        assert prediction.label == label
        assert_almost_equal(prediction.confidence, confidence)

    def test_classify_not_tokenized(self):
        document = 'Chinese Chinese Chinese Tokyo Japan'
        assert_raises(TypeError, self.classifier.classify, document)

    def test_top_features(self):
        """Dropped (non-top) features must not affect probabilities."""
        docs = [(['happy', 'joy', 'smile'], 'positive'),
                (['happy', 'joy', 'frown'], 'positive'),
                (['sad', 'frown', 'tired'], 'negative'),
                (['sad', 'tired', 'bored'], 'negative')]
        classifier = MultinomialNB()
        classifier.top_features = 2
        classifier.train(*docs)
        result = classifier._most_common['positive'].store
        assert result == {'happy': 2, 'joy': 2}
        result = classifier._most_common['negative'].store
        assert result == {'sad': 2, 'tired': 2}
        first = classifier.prob_all(['happy', 'smile'])
        second = classifier.prob_all(['happy', 'smile', 'smile'])
        assert first == second, classifier._most_common
        first = classifier.prob_all(['sad', 'tired'])['negative']
        second = classifier.prob_all(['sad', 'tired', 'frown'])['negative']
        assert first == second, classifier._most_common
# NOTE(review): this class is a duplicate of the TestMultinomialNB defined
# earlier in this file. The later definition shadows the earlier one at
# import time, so only one copy's tests actually run under the test
# collector. Confirm and delete one copy.
class TestMultinomialNB(object):
    """Exercise MultinomialNB end to end on a small worked example."""
    # This test uses the examples provided by:
    # http://nlp.stanford.edu/IR-book/pdf/13bayes.pdf

    def setup(self):
        """Build a fresh classifier from the IR-book toy corpus."""
        self.training_docs = [('Chinese Bejing Chinese', 'yes'),
                              ('Chinese Chinese Shanghai', 'yes'),
                              ('Chinese Macao', 'yes'),
                              ('Tokyo Japan Chinese', 'no')]
        # Training documents must be pre-tokenized lists of features.
        self.training_docs = [(x.split(), y) for x, y in self.training_docs]
        self.classifier = MultinomialNB(*self.training_docs)
        self.make_snapshot()

    def teardown(self):
        # Each test is expected to leave the classifier's internals intact.
        self.assert_snapshot_identical()

    def make_snapshot(self):
        """Deep-copy the classifier's internal state for later comparison."""
        self.orig_label_count = deepcopy(self.classifier._label_count)
        self.orig_label_vocab = deepcopy(self.classifier._label_vocab)
        self.orig_label_feature_count = deepcopy(self.classifier
                                                 ._label_feature_count)
        self.orig_label_length = deepcopy(self.classifier._label_length)

    def assert_snapshot_identical(self):
        """Call if classifier's internals shouldn't have changed."""
        assert self.orig_label_count == self.classifier._label_count
        assert self.orig_label_vocab == self.classifier._label_vocab
        assert (self.orig_label_feature_count ==
                self.classifier._label_feature_count)
        assert self.orig_label_length == self.classifier._label_length

    def test_init_no_training(self):
        """An untrained classifier starts empty; train() populates it."""
        classifier = MultinomialNB()
        assert classifier.vocabulary == set()
        assert classifier.labels == set()
        classifier.train(*self.training_docs)
        # NOTE(review): these helpers re-check self.classifier (built in
        # setup), not the local ``classifier`` trained above -- confirm
        # that is the intent.
        self.test_labels()
        self.test_vocabulary()

    def test_train_one_document(self):
        documents = (['one', 'document', 'already', 'tokenized'], 'label')
        classifier = MultinomialNB(documents)
        expected = set(['one', 'document', 'already', 'tokenized'])
        assert classifier.vocabulary == expected

    def test_train_many_document(self):
        # Duplicated documents must not duplicate vocabulary entries.
        documents = [(['one', 'document', 'already', 'tokenized'],
                      'label')] * 5
        classifier = MultinomialNB(*documents)
        expected = set(['one', 'document', 'already', 'tokenized'])
        assert classifier.vocabulary == expected

    def test_train_not_tokenized(self):
        # A raw string instead of a token list must raise TypeError.
        document = ('one document not tokenized', 'label')
        assert_raises(TypeError, self.classifier.train, document)

    def test_labels(self):
        expected = set(['yes', 'no'])
        assert self.classifier.labels == expected

    def test_vocabulary(self):
        expected = set(['Chinese', 'Bejing', 'Shanghai', 'Macao',
                        'Tokyo', 'Japan'])
        assert self.classifier.vocabulary == expected

    def test_vocab_size(self):
        actual = len(self.classifier.vocabulary)
        result = self.classifier._vocab_size
        assert actual == result

    def test_label_feature_count(self):
        tests = [('yes', 'Chinese', 5),
                 ('no', 'Chinese', 1),
                 ('no', 'Japan', 1)]
        for label, feature, count in tests:
            assert self.classifier._label_feature_count[label][feature] == count
        # Features seen only under one label must not leak into another.
        assert 'Japan' not in self.classifier._label_feature_count['yes']

    def test_prior(self):
        tests = [('yes', Fraction(3, 4)), ('no', Fraction(1, 4))]
        for label, prob in tests:
            # exact=True yields Fractions; exact=False yields floats.
            self.classifier.exact = True
            result = self.classifier.prior(label)
            assert result == prob
            self.classifier.exact = False
            result = self.classifier.prior(label)
            prob = float(prob)
            assert_almost_equal(result, prob)

    def test_prior_unseen_label(self):
        assert_raises(KeyError, self.classifier.prior, '__unseen__')

    def test_conditional(self):
        # Smoothed conditionals, including unseen-feature fallbacks.
        tests = [('Chinese', 'yes', Fraction(6, 14)),
                 ('Japan', 'yes', Fraction(1, 14)),
                 ('Chinese', 'no', Fraction(2, 9)),
                 ('Tokyo', 'no', Fraction(2, 9)),
                 ('Japan', 'no', Fraction(2, 9)),
                 ('__invalid__', 'yes', Fraction(1, 14)),
                 ('__invalid__', 'no', Fraction(1, 9))]
        for feature, label, prob in tests:
            self.classifier.exact = True
            result = self.classifier.conditional(feature, label)
            assert result == prob
            self.classifier.exact = False
            result = self.classifier.conditional(feature, label)
            prob = float(prob)
            assert_almost_equal(result, prob)

    def test_conditional_laplace(self):
        # Smoothing constant of 2 instead of the default add-one.
        self.classifier.laplace = 2
        tests = [('Chinese', 'yes', Fraction(7, 20)),
                 ('Japan', 'yes', Fraction(1, 10)),
                 ('Chinese', 'no', Fraction(1, 5)),
                 ('Tokyo', 'no', Fraction(1, 5)),
                 ('Japan', 'no', Fraction(1, 5)),
                 ('__invalid__', 'yes', Fraction(1, 10)),
                 ('__invalid__', 'no', Fraction(2, 15))]
        for feature, label, prob in tests:
            self.classifier.exact = True
            result = self.classifier.conditional(feature, label)
            assert result == prob
            self.classifier.exact = False
            result = self.classifier.conditional(feature, label)
            prob = float(prob)
            assert_almost_equal(result, prob)

    def test_conditional_unseen_feature(self):
        # Querying an unseen feature must not add it to the counts.
        self.classifier.conditional('__unseen__', 'yes')
        assert '__unseen__' not in self.classifier._label_feature_count['yes']

    def test_conditional_unseen_label(self):
        assert_raises(KeyError, self.classifier.conditional,
                      '__unseen__', '__unseen__')
        assert '__unseen__' not in self.classifier._label_feature_count

    def test_score(self):
        # Unnormalized score: prior times each token's conditional.
        tests = [('Chinese Chinese Chinese Tokyo Japan', 'yes',
                  Fraction(3, 4) * Fraction(3, 7) * Fraction(3, 7) *
                  Fraction(3, 7) * Fraction(1, 14) * Fraction(1, 14)),
                 ('Chinese Chinese Chinese Tokyo Japan', 'no',
                  Fraction(1, 4) * Fraction(2, 9) * Fraction(2, 9) *
                  Fraction(2, 9) * Fraction(2, 9) * Fraction(2, 9))]
        for document, label, score in tests:
            self.classifier.exact = True
            result = self.classifier._score(document.split(), label)
            assert result == score
            self.classifier.exact = False
            # Inexact scores are log-probabilities; exp() to compare.
            result = self.classifier._score(document.split(), label)
            result = math.exp(result)
            score = float(score)
            assert_almost_equal(result, score)

    def test_score_not_tokenized(self):
        document, label = 'Chinese Chinese Chinese Tokyo Japan', 'yes'
        assert_raises(TypeError, self.classifier._score, document, label)

    def test_prob(self):
        # Normalized posterior probabilities for each label.
        tests = [('Chinese Chinese Chinese Tokyo Japan', 'yes',
                  Fraction(4782969, 6934265)),
                 ('Chinese Chinese Chinese Tokyo Japan', 'no',
                  Fraction(2151296, 6934265))]
        for document, label, prob in tests:
            self.classifier.exact = True
            result = self.classifier.prob(document.split(), label)
            assert result == prob
            self.classifier.exact = False
            result = self.classifier.prob(document.split(), label)
            prob = float(prob)
            assert_almost_equal(result, prob)

    def test_prob_not_tokenized(self):
        document, label = 'Chinese Chinese Chinese Tokyo Japan', 'yes'
        assert_raises(TypeError,
                      self.classifier.prob, document, label)

    def test_prob_all(self):
        document = 'Chinese Chinese Chinese Tokyo Japan'
        tests = [('yes', Fraction(4782969, 6934265)),
                 ('no', Fraction(2151296, 6934265))]
        for label, prob in tests:
            self.classifier.exact = True
            prob_all = self.classifier.prob_all(document.split())
            assert prob_all[label] == prob
            self.classifier.exact = False
            prob_all = self.classifier.prob_all(document.split())
            prob = float(prob)
            assert_almost_equal(prob_all[label], prob)

    def test_prob_all_not_tokenized(self):
        document = 'Chinese Chinese Chinese Tokyo Japan'
        assert_raises(TypeError, self.classifier.prob_all, document)

    def test_prob_all_near_zero(self):
        # Issue gh-14.
        # Very long documents drive raw probabilities toward zero; this
        # must not raise (e.g. from underflow / log-of-zero).
        document = 'Chinese Chinese Chinese Tokyo Japan ' * 1000
        self.classifier.exact = False
        self.classifier.prob_all(document.split())

    def test_classify(self):
        document = 'Chinese Chinese Chinese Tokyo Japan'.split()
        label, confidence = ('yes', float(Fraction(4782969, 6934265)))
        prediction = self.classifier.classify(document)
        # Tuple check
        assert prediction == (label, confidence)
        # Namedtuple check
        assert prediction.label == label
        assert_almost_equal(prediction.confidence, confidence)

    def test_classify_not_tokenized(self):
        document = 'Chinese Chinese Chinese Tokyo Japan'
        assert_raises(TypeError, self.classifier.classify, document)

    def test_top_features(self):
        """Dropped (non-top) features must not affect probabilities."""
        docs = [(['happy', 'joy', 'smile'], 'positive'),
                (['happy', 'joy', 'frown'], 'positive'),
                (['sad', 'frown', 'tired'], 'negative'),
                (['sad', 'tired', 'bored'], 'negative')]
        classifier = MultinomialNB()
        classifier.top_features = 2
        classifier.train(*docs)
        result = classifier._most_common['positive'].store
        assert result == {'happy': 2, 'joy': 2}
        result = classifier._most_common['negative'].store
        assert result == {'sad': 2, 'tired': 2}
        first = classifier.prob_all(['happy', 'smile'])
        second = classifier.prob_all(['happy', 'smile', 'smile'])
        assert first == second, classifier._most_common
        first = classifier.prob_all(['sad', 'tired'])['negative']
        second = classifier.prob_all(['sad', 'tired', 'frown'])['negative']
        assert first == second, classifier._most_common