Esempio n. 1
0
class TestNgramCounter:
    """Tests for NgramCounter that only involve lookup, no modification.

    Two counters are built once for the whole class from the same pair of
    toy "sentences" (lists of single characters) and are shared by every
    test method.
    """

    @classmethod
    def setup_class(cls):
        """Build the shared counters and an assertion helper on the class."""
        text = [list("abcd"), list("egdbe")]
        cls.trigram_counter = NgramCounter(
            everygrams(sent, max_len=3) for sent in text)
        cls.bigram_counter = NgramCounter(
            everygrams(sent, max_len=2) for sent in text)
        # A bare TestCase instance, kept only for its assertCountEqual helper.
        cls.case = unittest.TestCase()

    def test_N(self):
        """N() is the grand total of n-grams over every stored order."""
        # 9 unigrams + 7 bigrams across the two sentences.
        assert self.bigram_counter.N() == 16
        # The same 16 plus 5 trigrams.
        assert self.trigram_counter.N() == 21

    def test_counter_len_changes_with_lookup(self):
        """Indexing an order that was never counted adds an entry for it."""
        assert len(self.bigram_counter) == 2
        # NOTE: this lookup changes the counter shared by the whole class, so
        # the first assertion only holds the first time this test runs
        # against a given setup_class() result.
        self.bigram_counter[50]
        assert len(self.bigram_counter) == 3

    def test_ngram_order_access_unigrams(self):
        """Looking up order 1 gives the same object as ``unigrams``."""
        assert self.bigram_counter[1] == self.bigram_counter.unigrams

    def test_ngram_conditional_freqdist(self):
        """Orders above 1 expose their contexts through ``conditions()``."""
        expected_trigram_contexts = [
            ("a", "b"),
            ("b", "c"),
            ("e", "g"),
            ("g", "d"),
            ("d", "b"),
        ]
        expected_bigram_contexts = [("a", ), ("b", ), ("d", ), ("e", ),
                                    ("c", ), ("g", )]

        bigrams = self.trigram_counter[2]
        trigrams = self.trigram_counter[3]

        # assertCountEqual ignores ordering, which conditions() does not
        # promise here.
        self.case.assertCountEqual(expected_bigram_contexts,
                                   bigrams.conditions())
        self.case.assertCountEqual(expected_trigram_contexts,
                                   trigrams.conditions())
Esempio n. 2
0
from nltk.lm.preprocessing import flatten
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE

from nltk.lm import Lidstone
from nltk.lm import Laplace
from nltk.lm import KneserNeyInterpolated

# Exercise 1

# Total number of unigrams in each inaugural address, keyed by corpus file id.
president_unigrams = {}

for president in inaugural.fileids():
    # One unigram iterator per sentence; the counter consumes all of them and
    # N() then gives the total number of n-grams it stored.
    text_unigrams = [ngrams(sent, 1) for sent in inaugural.sents(president)]
    ngram_counts = NgramCounter(text_unigrams)
    president_unigrams[president] = ngram_counts.N()

# (count, file id) pairs, so max()/min() order by count first and fall back
# to the file id only on ties.
inverse_unigrams = [(value, key) for key, value in president_unigrams.items()]

# Find each extreme once instead of scanning the list twice per print call.
longest_count, longest_address = max(inverse_unigrams)
shortest_count, shortest_address = min(inverse_unigrams)
print(longest_address, longest_count)  # longest address: Harrison in 1841
print(shortest_address,
      shortest_count)  # shortest address: Washington in 1793

# Vocabulary size of each inaugural address, keyed by corpus file id.
president_vocabulary = {}

for president in inaugural.fileids():
    # unk_cutoff=2 is passed straight to Vocabulary; see the NLTK docs for how
    # words below the cutoff affect len(vocab).
    vocab = Vocabulary(inaugural.words(president), unk_cutoff=2)
    president_vocabulary[president] = len(vocab)

# Same (size, file id) inversion as above, for ranking by vocabulary size.
inverse_vocabulary = [(value, key)
                      for key, value in president_vocabulary.items()]