Example 1
class LaplaceBigramTests(unittest.TestCase):
    """unit tests for Laplace class"""

    score_tests = [
        # basic sanity check ("*" marks the add-one-smoothed count):
        # count(d | c) = 1
        # *count(d | c) = 2
        # total count of words following context "c" = 1
        # *total (denominator) = 1 + vocab size 8 = 9
        ("d", ["c"], 2.0 / 9),
        # Total unigrams: 14
        # Vocab size: 8
        # Denominator: 14 + 8 = 22
        # count("a") = 2
        # *count("a") = 3
        ("a", None, 3.0 / 22),
        # in vocabulary but unseen
        # count("z") = 0
        # *count("z") = 1
        ("z", None, 1.0 / 22),
        # out-of-vocabulary words fall back to the <UNK> score
        # count("<UNK>") = 3
        # *count("<UNK>") = 4
        ("y", None, 4.0 / 22),
    ]

    def setUp(self):
        vocab, training_text = _prepare_test_data(2)
        self.model = Laplace(2, vocabulary=vocab)
        self.model.fit(training_text)

    def test_gamma(self):
        # Make sure the gamma is set to 1
        self.assertEqual(1, self.model.gamma)

    def test_entropy_perplexity(self):
        text = [
            ("<s>", "a"),
            ("a", "c"),
            ("c", "<UNK>"),
            ("<UNK>", "d"),
            ("d", "c"),
            ("c", "</s>"),
        ]
        # Unlike MLE this should be able to handle completely novel ngrams
        # Ngram = score, log score
        # <s>, a    = 0.2, -2.3219
        # a, c      = 0.1, -3.3219
        # c, UNK    = 0.(1), -3.1699
        # UNK, d    = 0.(09), -3.4594
        # d, c      = 0.1, -3.3219
        # c, </s>   = 0.(1), -3.1699
        # Sum of logscores: -18.7651
        # Entropy = -(average logscore) = 3.1275
        H = 3.1275
        perplexity = 8.7393
        self.assertAlmostEqual(H, self.model.entropy(text), places=4)
        self.assertAlmostEqual(perplexity,
                               self.model.perplexity(text),
                               places=4)
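
The expected values hard-coded in this test follow from add-one smoothing and from the relation perplexity = 2 ** entropy. A minimal sketch of that arithmetic, using only the counts quoted in the comments above (variable names are illustrative and not part of the test suite):

count_d_after_c = 1        # count(d | c) in the training text
total_after_c = 1          # total bigram count for context "c"
vocab_size = 8             # vocabulary size used by the fixture

score = (count_d_after_c + 1) / (total_after_c + vocab_size)
assert abs(score - 2.0 / 9) < 1e-12        # matches ("d", ["c"], 2.0 / 9)

H = 3.1275                                 # entropy from the comments above
assert abs(2 ** H - 8.7393) < 1e-3         # perplexity = 2 ** entropy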
Example 2
    def compute_pp(self, n, tokenized_train, tokenized_test):
        # Build everygrams plus a padded-token stream (used for the vocabulary)
        # from the training data, and a separate everygram stream for the test data.
        train_data, train_vocab = padded_everygram_pipeline(n, tokenized_train)
        test_data, _ = padded_everygram_pipeline(n, tokenized_test)

        # Fit an order-n Laplace model on the training everygrams and vocabulary.
        model = Laplace(n)
        model.fit(train_data, train_vocab)

        # Average the per-sentence perplexities over the test set.
        total, num_sents = 0.0, 0
        for test in test_data:
            total += model.perplexity(test)
            num_sents += 1

        return total / num_sents
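
For reference, padded_everygram_pipeline returns two lazy streams: a per-sentence stream of everygrams (for counting) and a flat stream of padded tokens (for building the vocabulary), which is why the method above unpacks two values from each call. A small illustration with a toy sentence chosen only for demonstration:

from nltk.lm.preprocessing import padded_everygram_pipeline

toy = [["a", "b"]]                          # one toy sentence
everygram_stream, padded_tokens = padded_everygram_pipeline(2, toy)

# Each sentence yields its unigrams and bigrams over ['<s>', 'a', 'b', '</s>']
# (the iteration order of the everygrams varies across NLTK versions).
print([sorted(sent) for sent in everygram_stream])

# The flat padded-token stream is what fit() uses to build the vocabulary.
print(list(padded_tokens))                  # ['<s>', 'a', 'b', '</s>']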
Example 3
def vary_ngram(train_corpus, test_corpus, n_gram_orders):
    '''
    Use nltk.lm.Laplace for training.
    Returns a dictionary of perplexity values for language models of different n-gram orders.

    :param train_corpus: list of list of str, corpus to train the language model on.
    :param test_corpus: list of list of str, corpus to test the language model on.
    :param n_gram_orders: list of int, orders of the n-grams desired.
    :returns: a dictionary of perplexities at different orders, key=order, value=perplexity.
    :rtype: dict

    Hint: Follow the same LM training procedure as in the notebook at the end of Exercise 1.
    '''

    ret = {}
    for order in n_gram_orders:
        train, vocab = padded_everygram_pipeline(order, train_corpus)
        lm = Laplace(order)
        lm.fit(train, vocab)
        # perplexity() expects an iterable of n-gram tuples rather than a flat
        # list of tokens, so pad the test sentences the same way as the training
        # data and keep only the n-grams of the model's order.
        test_data, _ = padded_everygram_pipeline(order, test_corpus)
        ret[order] = lm.perplexity(
            ngram for sent in test_data for ngram in sent if len(ngram) == order
        )
    return ret
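
A possible way to call this function, with toy corpora and orders chosen purely to illustrate the expected input shape (they are not part of the original exercise):

from nltk.lm import Laplace
from nltk.lm.preprocessing import padded_everygram_pipeline

train_corpus = [["the", "cat", "sat"], ["the", "dog", "sat"]]   # toy data
test_corpus = [["the", "cat", "sat"]]
print(vary_ngram(train_corpus, test_corpus, [1, 2, 3]))
# e.g. {1: ..., 2: ..., 3: ...}   (lower perplexity indicates a better fit)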