class TestNgramCounter:
    """Tests for NgramCounter lookups.

    The counters are built once per class in ``setup_class`` and shared by
    every test.  No test adds ngrams to them; the only growth is the extra
    key created by looking up an unseen order in
    ``test_counter_len_changes_with_lookup``.
    """

    @classmethod
    def setup_class(cls):
        """Build the shared bigram/trigram counters from two toy sentences."""
        text = [list("abcd"), list("egdbe")]
        cls.trigram_counter = NgramCounter(
            everygrams(sent, max_len=3) for sent in text
        )
        cls.bigram_counter = NgramCounter(
            everygrams(sent, max_len=2) for sent in text
        )
        # Plain asserts cannot compare collections regardless of order, so a
        # TestCase instance is kept around for assertCountEqual.
        cls.case = unittest.TestCase()

    def test_N(self):
        """N() totals the ngrams of every order held by the counter."""
        # "abcd" -> 4 + 3 (+ 2) and "egdbe" -> 5 + 4 (+ 3) ngrams.
        assert self.bigram_counter.N() == 16
        assert self.trigram_counter.N() == 21

    def test_counter_len_changes_with_lookup(self):
        """Looking up an order that was never counted adds a new entry."""
        assert len(self.bigram_counter) == 2
        # The bare subscript is intentional: the lookup itself is what
        # grows the (class-shared) counter.
        self.bigram_counter[50]
        assert len(self.bigram_counter) == 3

    def test_ngram_order_access_unigrams(self):
        """Order 1 is reachable both by index and via ``unigrams``."""
        assert self.bigram_counter[1] == self.bigram_counter.unigrams

    def test_ngram_conditional_freqdist(self):
        """Orders 2 and 3 expose the expected contexts as conditions."""
        expected_trigram_contexts = [
            ("a", "b"),
            ("b", "c"),
            ("e", "g"),
            ("g", "d"),
            ("d", "b"),
        ]
        expected_bigram_contexts = [
            ("a",),
            ("b",),
            ("d",),
            ("e",),
            ("c",),
            ("g",),
        ]

        bigrams = self.trigram_counter[2]
        trigrams = self.trigram_counter[3]

        # Order of the conditions is not significant, only their multiset.
        self.case.assertCountEqual(
            expected_bigram_contexts, bigrams.conditions()
        )
        self.case.assertCountEqual(
            expected_trigram_contexts, trigrams.conditions()
        )
from nltk.lm.preprocessing import flatten
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE
from nltk.lm import Lidstone
from nltk.lm import Laplace
from nltk.lm import KneserNeyInterpolated

# Exercise 1

# Unigram total of every inaugural address, keyed by corpus file id.
president_unigrams = {}
for president in inaugural.fileids():
    text_unigrams = [
        ngrams(sentence, 1) for sentence in inaugural.sents(president)
    ]
    ngram_counts = NgramCounter(text_unigrams)
    president_unigrams[president] = ngram_counts.N()

# (count, file id) pairs, so that max()/min() order by count first.
inverse_unigrams = [
    (count, fileid) for fileid, count in president_unigrams.items()
]

# Longest discourse (original note: Harrison in 1841).
longest_count, longest_fileid = max(inverse_unigrams)
print(longest_fileid, longest_count)

# Shortest discourse (original note: Washington in 1793).
shortest_count, shortest_fileid = min(inverse_unigrams)
print(shortest_fileid, shortest_count)

# Vocabulary size of every address, built with unk_cutoff=2.
president_vocabulary = {}
for president in inaugural.fileids():
    vocab = Vocabulary(inaugural.words(president), unk_cutoff=2)
    president_vocabulary[president] = len(vocab)

# Same (size, file id) inversion as above, for the vocabulary sizes.
inverse_vocabulary = [
    (size, fileid) for fileid, size in president_vocabulary.items()
]