# Code example #1
0
class TestSequenceFunctions(unittest.TestCase):
    """Unit tests for ``TfIdf``: vocabulary lookup, tokenization, tf and idf.

    Every test builds its own vocabulary through ``train_seen`` and
    ``finalize`` on the fresh ``TfIdf`` instance created in ``setUp``.
    """

    def setUp(self):
        """Create a new ``TfIdf`` with ``unk_cutoff=2`` before each test."""
        self.unk_cutoff = 2
        self.vocab = TfIdf(unk_cutoff=self.unk_cutoff)

    def _train_and_finalize(self, counted, uncounted=()):
        """Feed training observations to ``self.vocab`` and finalize it.

        Args:
            counted: Sequence of ``(word, count)`` pairs; each is forwarded
                as ``train_seen(word, count)``, in order.
            uncounted: Sequence of words forwarded as ``train_seen(word)``
                (no explicit count), in order, after all ``counted`` pairs.
        """
        for word, count in counted:
            self.vocab.train_seen(word, count)
        for word in uncounted:
            self.vocab.train_seen(word)
        self.vocab.finalize()

    def test_vocab(self):
        """Infrequent and unseen words share one id; frequent words do not."""
        self._train_and_finalize([("a", 300)], uncounted=["b", "c"])

        frequent_id = self.vocab.vocab_lookup("a")
        infrequent_id = self.vocab.vocab_lookup("b")

        # Infrequent words should look the same
        self.assertEqual(infrequent_id, self.vocab.vocab_lookup("c"))

        # Infrequent words should look the same as never seen words
        self.assertEqual(infrequent_id, self.vocab.vocab_lookup("d"))

        # The frequent word should be different from the infrequent word
        self.assertNotEqual(frequent_id, infrequent_id)

    def test_censor(self):
        """Tokenization treats infrequent and unseen words interchangeably."""
        self._train_and_finalize([("a", 300)], uncounted=["b", "c"])

        tokens_abd = [str(tok) for tok in self.vocab.tokenize("a b d")]
        tokens_dba = [str(tok) for tok in self.vocab.tokenize("d b a")]
        tokens_abd_again = [str(tok) for tok in self.vocab.tokenize("a b d")]
        tokens_bda = [str(tok) for tok in self.vocab.tokenize("b d a")]

        # Tokenizing the same text twice must give the same result.
        self.assertEqual(tokens_abd, tokens_abd_again)
        # "b" (infrequent) and "d" (unseen) swap places without any effect.
        self.assertEqual(tokens_dba, tokens_bda)

        # Three words in, exactly three tokens out: this assertion leaves no
        # room for additional start/end markers in the tokenized output.
        self.assertEqual(len(tokens_abd), 3)
        # "a" yields the same token whether it comes first or last.
        self.assertEqual(tokens_abd[0], tokens_dba[2])
        # "b" in one text and "d" in the other yield the same token.
        self.assertEqual(tokens_abd[1], tokens_dba[0])

    def test_tf(self):
        """Term frequency is a word's share of all tokens that were added."""
        self._train_and_finalize([("a", 300)])

        self.vocab.add_document("a a b")

        word_a = self.vocab.vocab_lookup("a")
        word_b = self.vocab.vocab_lookup("b")
        word_c = self.vocab.vocab_lookup("c")

        # "a" accounts for two of the three tokens in the document.
        self.assertAlmostEqual(self.vocab.term_freq(word_a), 2.0 / 3.0)
        # "b" was never trained, so it is counted like any other unseen word;
        # "c" is also unseen and is therefore expected to report the same
        # frequency even though it does not occur in the document.
        self.assertAlmostEqual(self.vocab.term_freq(word_b), 1.0 / 3.0)
        self.assertAlmostEqual(self.vocab.term_freq(word_c), 1.0 / 3.0)

    def test_df(self):
        """Inverse document frequency equals log10(total docs / docs with word)."""
        self._train_and_finalize([("a", 300), ("b", 100)])

        self.vocab.add_document("a a b")
        self.vocab.add_document("b b c")
        self.vocab.add_document("a a a")
        self.vocab.add_document("a a a")

        word_a = self.vocab.vocab_lookup("a")
        word_b = self.vocab.vocab_lookup("b")
        word_c = self.vocab.vocab_lookup("c")
        word_d = self.vocab.vocab_lookup("d")

        # "a" occurs in 3 of the 4 documents.
        self.assertAlmostEqual(self.vocab.inv_docfreq(word_a),
                               log10(4.0 / 3.0))
        # "b" occurs in 2 of the 4 documents.
        self.assertAlmostEqual(self.vocab.inv_docfreq(word_b), log10(2.0))
        # "c" and "d" were never trained; both are expected to report the
        # value for a token that occurs in 1 of the 4 documents.
        self.assertAlmostEqual(self.vocab.inv_docfreq(word_c), log10(4.0))
        self.assertAlmostEqual(self.vocab.inv_docfreq(word_d), log10(4.0))