import unittest

from nltk.lm import Laplace


class LaplaceBigramTests(unittest.TestCase):
    """Unit tests for the Laplace class."""

    score_tests = [
        # basic sanity check (* marks add-one smoothed counts):
        # count(d | c) = 1
        # *count(d | c) = 2
        # Count(w | c for w in vocab) = 1
        # *Count(w | c for w in vocab) = 9
        ("d", ["c"], 2.0 / 9),
        # Total unigrams: 14
        # Vocab size: 8
        # Denominator: 14 + 8 = 22
        # count("a") = 2
        # *count("a") = 3
        ("a", None, 3.0 / 22),
        # in vocabulary but unseen
        # count("z") = 0
        # *count("z") = 1
        ("z", None, 1.0 / 22),
        # out of vocabulary should use "UNK" score
        # count("<UNK>") = 3
        # *count("<UNK>") = 4
        ("y", None, 4.0 / 22),
    ]

    def setUp(self):
        vocab, training_text = _prepare_test_data(2)
        self.model = Laplace(2, vocabulary=vocab)
        self.model.fit(training_text)

    def test_gamma(self):
        # Laplace smoothing is add-one smoothing, so gamma must be 1.
        self.assertEqual(1, self.model.gamma)

    def test_entropy_perplexity(self):
        text = [
            ("<s>", "a"),
            ("a", "c"),
            ("c", "<UNK>"),
            ("<UNK>", "d"),
            ("d", "c"),
            ("c", "</s>"),
        ]
        # Unlike MLE, Laplace can score completely novel ngrams.
        # Ngram     = score,   log2 score
        # <s>, a    = 0.2,     -2.3219
        # a, c      = 0.1,     -3.3219
        # c, <UNK>  = 0.(1),   -3.1699
        # <UNK>, d  = 0.(09),  -3.4594
        # d, c      = 0.1,     -3.3219
        # c, </s>   = 0.(1),   -3.1699
        # Sum of log scores: -18.7651
        # Entropy H = -(sum of log scores) / 6 = 3.1275
        # Perplexity = 2 ** H = 8.7393
        H = 3.1275
        perplexity = 8.7393
        self.assertAlmostEqual(H, self.model.entropy(text), places=4)
        self.assertAlmostEqual(perplexity, self.model.perplexity(text), places=4)
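# The setUp above calls a _prepare_test_data helper that falls outside this
# excerpt. Below is a minimal sketch of a helper consistent with the counts
# asserted in score_tests (a vocabulary of 8 types including <UNK>, and 14
# training unigrams after padding); the exact training sentences are an
# assumption, not the verbatim helper.

from nltk.lm import Vocabulary
from nltk.lm.preprocessing import pad_both_ends
from nltk.util import everygrams


def _prepare_test_data(ngram_order):
    """Return a (vocabulary, padded everygram training text) pair."""
    vocab = Vocabulary(["a", "b", "c", "d", "z", "<s>", "</s>"], unk_cutoff=1)
    training_text = [
        list(everygrams(list(pad_both_ends(sent, n=ngram_order)), max_len=ngram_order))
        for sent in [list("abcd"), list("egadbe")]
    ]
    return vocab, training_text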
# `vocab` is assumed to be a flat list of corpus tokens and `bigramsList` a
# list of padded bigram tuples, both built earlier (not shown here).
from nltk.lm import Laplace, Vocabulary

vocab = list(Vocabulary(vocab, unk_cutoff=1))

# Alternative preprocessing: pad the sentences and build the training
# everygrams and the vocabulary in one step.
# from nltk.lm.preprocessing import padded_everygram_pipeline
# train, vocab = padded_everygram_pipeline(2, text)

# The training data is bigrams, so the model order must be 2; the original
# snippet passed Laplace(3), which does not match bigram training data.
lm = Laplace(2)
lm.fit([bigramsList], vocabulary_text=vocab)
lm.generate(4, text_seed=["government", "had"])


def generateSentences(seed):
    """Sample one word at a time until the sentence-end token is produced."""
    sent = list(seed)
    word = lm.generate(1, text_seed=sent)
    sent.append(word)
    while word != "</s>":
        word = lm.generate(1, text_seed=sent[-2:])
        sent.append(word)
    return sent


sen = " ".join(generateSentences(["<s>", "india"]))
print(sen)

# Average negative log2 probability per bigram of the training data.
print(lm.entropy(bigramsList))
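# The snippet above assumes `vocab` and `bigramsList` already exist. A
# self-contained sketch of one way to build them, using a made-up toy corpus
# purely for illustration:

from nltk import bigrams
from nltk.lm import Laplace, Vocabulary
from nltk.lm.preprocessing import pad_both_ends

corpus = [
    ["india", "is", "a", "country"],
    ["the", "government", "had", "a", "plan"],
]

# Flatten the padded sentences into one token list for the vocabulary ...
tokens = [w for sent in corpus for w in pad_both_ends(sent, n=2)]
vocab = list(Vocabulary(tokens, unk_cutoff=1))

# ... and collect the padded bigrams used as training text.
bigramsList = [bg for sent in corpus for bg in bigrams(pad_both_ends(sent, n=2))]

lm = Laplace(2)
lm.fit([bigramsList], vocabulary_text=vocab)
print(lm.score("india", ["<s>"]))  # smoothed P(india | <s>)
print(lm.entropy(bigramsList))     # average -log2 probability per bigram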