def test_spam_probability(self):
    """Check the score given to a sample message by a two-message model.

    Word probabilities are built from one message labelled 1 and one
    labelled 0; ``nb.spam_probability`` is then expected to score the
    sample message at approximately 0.9 (``assertAlmostEqual`` default
    tolerance).
    """
    labelled_messages = [("foo bar", 1), ("bar baz", 0)]
    # The two positional 1s are the per-label message totals
    # (presumably spam first, then non-spam -- confirm against nb).
    probabilities = nb.word_probabilities(
        nb.count_words(labelled_messages), 1, 1
    )
    score = nb.spam_probability(
        probabilities, "there is no foo bar like bar bar"
    )
    self.assertAlmostEqual(score, 0.9)
def train(self, training_set):
    """Fit this classifier's word probabilities from labelled messages.

    ``training_set`` is iterated as ``(message, is_spam)`` pairs and must
    also support ``len()``.  The value returned by
    ``nb.word_probabilities`` is stored on ``self.word_probs``; nothing
    is returned.
    """
    # Tally the messages whose label is truthy; the remainder are non-spam.
    spam_total = sum(1 for _, is_spam in training_set if is_spam)
    non_spam_total = len(training_set) - spam_total

    # Per-word counts plus the two class totals and self.k go into the
    # probability step.
    counts_by_word = nb.count_words(training_set)
    self.word_probs = nb.word_probabilities(
        counts_by_word, spam_total, non_spam_total, self.k
    )
def test_word_probabilities(self):
    """Check the per-word triples produced for a two-message model.

    Counts are built from one message labelled 1 and one labelled 0 and
    passed to ``nb.word_probabilities`` with totals of 1 and 1 (any
    further parameters are left at their defaults).  Each expected entry
    is a ``(word, value, value)`` triple -- presumably the word's
    probability under each label; confirm against ``nb``.
    """
    training_set = [("foo bar", 1), ("bar baz", 0)]
    counts = nb.count_words(training_set)
    actual = nb.word_probabilities(counts, 1, 1)
    expected = [
        ("baz", 0.25, 0.75),
        ("foo", 0.75, 0.25),
        ("bar", 0.75, 0.75),
    ]
    # Compare order-insensitively: which word comes first is not part of
    # the behaviour under test, and a positional list comparison would
    # tie this test to whatever iteration order nb happens to produce.
    # Sorting both sides still checks every triple and its multiplicity.
    self.assertEqual(sorted(actual), sorted(expected))
def test_count_words(self):
    """Check the per-word counts produced for a single labelled message."""
    counts = nb.count_words([("foo bar", 1)])
    # Each word of the one message (labelled 1) should map to [1, 0].
    self.assertEqual(counts, {"foo": [1, 0], "bar": [1, 0]})