Code example #1
    def test_ngram_text_generator_bad_num_input(self):  # new test
        """
        Raises ValueError with bad numeric inputs
        """
        bad_inputs = (-5, 0, -2, -1, -8)
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>')
        word_storage = WordStorage()
        word_storage.update(corpus)
        encoded = encode_text(word_storage, corpus)
        trie = NGramTrie(2, encoded)
        generator = NGramTextGenerator(word_storage, trie)

        for bad_input in bad_inputs:
            self.assertRaises(ValueError, generator._generate_sentence,
                              bad_input)
Code example #2
    def test_ngram_text_generator_throws_errors(self):
        """
        Raises ValueError with bad inputs
        """
        bad_inputs = [[], {}, None, 9, 9.34, True]
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>')
        word_storage = WordStorage()
        word_storage.update(corpus)
        encoded = encode_text(word_storage, corpus)
        trie = NGramTrie(2, encoded)
        generator = NGramTextGenerator(word_storage, trie)

        for bad_input in bad_inputs:
            self.assertRaises(ValueError, generator._generate_sentence,
                              bad_input)
Code example #3
    def test_get_most_frequent_gram_bad_inputs(self):
        """
        Checks that the method returns an empty tuple
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>')
        word_storage = WordStorage()
        word_storage.update(corpus)
        encoded = encode_text(word_storage, corpus)
        ngram = NGramTrie(3, encoded)
        expected = ()
        generator = NGramTextGenerator(word_storage, ngram)
        bad_inputs = [[], {}, (), None, 9, 9.34, True]
        for bad_input in bad_inputs:
            self.assertEqual(expected,
                             generator.get_most_frequent_gram(bad_input))
Code example #4
    def test_ngram_text_generator_generate_next_word(self):
        """
        Checks that the next word is generated properly
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'i', 'have', 'a', 'bruno',
                  'cat', '<END>')
        word_storage = WordStorage()
        word_storage.update(corpus)
        encoded = encode_text(word_storage, corpus)
        ngram = NGramTrie(3, encoded)

        generator = NGramTextGenerator(word_storage, ngram)
        context = (word_storage.get_id('i'), word_storage.get_id('have'))
        expected = word_storage.get_id('a')
        actual = generator._generate_next_word(context)
        self.assertEqual(expected, actual)
Code example #5
    def test_ngram_text_generator_generate_next_word_incorrect_context(self):
        """
        Checks that the method throws an error
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>')
        word_storage = WordStorage()
        word_storage.update(corpus)
        encoded = encode_text(word_storage, corpus)
        ngram = NGramTrie(3, encoded)

        generator = NGramTextGenerator(word_storage, ngram)
        # (3, ) is an incorrectly sized context for this trigram trie
        bad_inputs = [[], {}, (3, ), None, 9, 9.34, True]
        for bad_input in bad_inputs:
            self.assertRaises(ValueError, generator._generate_next_word,
                              bad_input)
Code example #6
    def test_ngram_text_generator_generate_sentence_properly(self):
        """
        generates correct output for a simple case
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>')
        word_storage = WordStorage()
        word_storage.update(corpus)
        encoded = encode_text(word_storage, corpus)
        trie = NGramTrie(2, encoded)
        context = (word_storage.get_id('i'), )

        end = word_storage.get_id('<END>')

        generator = NGramTextGenerator(word_storage, trie)
        actual = generator._generate_sentence(context)
        self.assertEqual(actual[-1], end)
Code example #7
    def test_get_most_frequent_gram_ideal(self):
        """
        Checks that the most frequent ngram is retrieved properly
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'i', 'have', 'a', 'bruno',
                  'cat', '<END>')
        word_storage = WordStorage()
        word_storage.update(corpus)
        encoded = encode_text(word_storage, corpus)
        ngram = NGramTrie(3, encoded)

        generator = NGramTextGenerator(word_storage, ngram)
        context = (word_storage.get_id('i'), word_storage.get_id('have'))
        expected = (word_storage.get_id('i'), word_storage.get_id('have'),
                    word_storage.get_id('a'))
        actual = generator.get_most_frequent_gram(context)
        self.assertEqual(expected, actual)
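
The expected result above can be checked by hand: both sentences in the corpus start with 'i have a', so ('i', 'have', 'a') is the only trigram extending the context, and it occurs twice. The counting sketch below is an illustration only, working on raw tokens; it is not the lab's get_most_frequent_gram, which operates on encoded ids.

from collections import Counter

# Same corpus as in test_get_most_frequent_gram_ideal, but raw tokens
corpus = ('i', 'have', 'a', 'cat', '<END>', 'i', 'have', 'a', 'bruno',
          'cat', '<END>')

# Count every trigram, then keep only those that extend ('i', 'have')
trigrams = Counter(corpus[i:i + 3] for i in range(len(corpus) - 2))
candidates = {gram: count for gram, count in trigrams.items()
              if gram[:2] == ('i', 'have')}
print(max(candidates, key=candidates.get))  # ('i', 'have', 'a'), seen twice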
Code example #8
    def test_decode_text_upper_first_letter(self):
        """
        Tests that all the letters in a sentence
        except the first one are lowercase
        """
        corpus = ('first', 'sentence', 'here', '<END>', 'second', 'sentence',
                  'here', '<END>', 'third', 'sentence', 'here', '<END>')

        storage = WordStorage()
        storage.update(corpus)

        encoded_text = encode_text(storage, corpus)
        trie = NGramTrie(3, encoded_text)
        context = (storage.get_id('first'), storage.get_id('sentence'))

        likelihood_generator = LikelihoodBasedTextGenerator(storage, trie)
        generated_encoded_text = likelihood_generator.generate_text(context, 1)
        decoded_text = decode_text(storage, generated_encoded_text)
        self.assertFalse(decoded_text[0][1:].isupper())
Code example #9
    def test_ngram_text_generator_generate_sentence_no_end(self):
        """
        should generate '<END>' anyway
        """
        corpus = ('i', 'have', 'a', 'cat', 'his', 'name', 'is', 'bruno', 'i',
                  'have', 'a', 'dog', 'too', 'his', 'name', 'is', 'rex', 'her',
                  'name', 'is', 'rex', 'too', '<END>')
        word_storage = WordStorage()
        word_storage.update(corpus)
        encoded = encode_text(word_storage, corpus)
        trie = NGramTrie(2, encoded)
        context = (word_storage.get_id('cat'), )

        generator = NGramTextGenerator(word_storage, trie)
        actual = generator._generate_sentence(context)

        expected = '<END>'
        actual = word_storage.get_word(actual[-1])
        self.assertEqual(expected, actual)
Code example #10
    def test_get_most_frequent_gram_no_such_context(self):
        """
        Checks that an empty tuple is returned
        when the context is not in the corpus
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>')
        word_storage = WordStorage()
        word_storage.update(corpus)
        encoded = encode_text(word_storage, corpus)
        ngram = NGramTrie(3, encoded)
        generator = NGramTextGenerator(word_storage, ngram)

        context = (
            word_storage.get_id('i'),
            word_storage.get_id('name'),
        )  # there is no such context in ngrams
        expected = ()
        actual = generator.get_most_frequent_gram(context)
        self.assertEqual(expected, actual)
Code example #11
    def test_load_model_has_generator_methods(self):
        corpus = ('i', 'have', 'a', 'cat', '<END>',
                  'his', 'name', 'is', 'bruno', '<END>',
                  'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>',
                  'her', 'name', 'is', 'rex', 'too', '<END>')

        storage = WordStorage()
        storage.update(corpus)

        encoded = encode_text(storage, corpus)

        trie = NGramTrie(2, encoded)

        generator = NGramTextGenerator(storage, trie)

        save_model(generator, 'my_awesome_model')
        loaded_model = load_model('my_awesome_model')

        self.assertEqual(dir(loaded_model), dir(generator))
Code example #12
    def test_save_model_ideal(self):
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is',
                  'rex', 'too', '<END>')

        storage = WordStorage()
        storage.update(corpus)

        encoded = encode_text(storage, corpus)

        trie = NGramTrie(2, encoded)

        generator = NGramTextGenerator(storage, trie)

        save_model(generator, 'my_awesome_model')

        with open('my_awesome_model', 'r', encoding='utf-8') as file_to_read:
            data = file_to_read.read()
        self.assertTrue(data)
Code example #13
    def test_generate_next_word_incorrect_context(self):
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is',
                  'rex', 'too', '<END>')

        storage = WordStorage()
        storage.update(corpus)

        encoded = encode_text(storage, corpus)

        trie = NGramTrie(3, encoded)

        bad_inputs = [[], {}, (2000, ), None, 9, 9.34, True]

        generator = LikelihoodBasedTextGenerator(storage, trie)

        for bad_context in bad_inputs:
            self.assertRaises(ValueError, generator._generate_next_word,
                              bad_context)
Code example #14
    def test_text_generator_generate_sentence_proper_number_of_end(self):
        """
        Checks that the class creates a correct sentence with only one <END>
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>', 'there', 'are', 'a',
                  'cat', 'outside', '<END>', 'here', 'is', 'a', 'cat',
                  'outside', '<END>')

        storage = WordStorage()
        storage.update(corpus)
        encoded = encode_text(storage, corpus)
        trie = NGramTrie(3, encoded)
        context = (storage.get_id('a'), storage.get_id('is'),
                   storage.get_id('<END>'))

        generator = NGramTextGenerator(storage, trie)
        actual = generator._generate_sentence(context)

        self.assertEqual(1, actual.count(storage.get_id('<END>')))
Code example #15
    def test_float_result(self):
        """
            Checks that returned result is float
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is',
                  'rex', 'too', '<END>')

        storage = WordStorage()
        storage.update(corpus)

        encoded = encode_text(storage, corpus)
        trie = NGramTrie(2, encoded)
        context = (storage.get_id('i'), )
        word = storage.get_id('have')

        generator = LikelihoodBasedTextGenerator(storage, trie)

        actual = generator._calculate_maximum_likelihood(word, context)
        self.assertEqual(float, type(actual))
Code example #16
    def test_ngram_text_generator_generate_next_word_no_such_context(self):
        """
        Checks that the next word is generated properly
        when the context is not found
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>')
        word_storage = WordStorage()
        word_storage.update(corpus)

        encoded = encode_text(word_storage, corpus)

        ngram = NGramTrie(3, encoded)

        generator = NGramTextGenerator(word_storage, ngram)
        context = (
            word_storage.get_id('i'),
            word_storage.get_id('name'),
        )  # there is no such context in ngrams, so return most frequent option
        expected_top_freq = word_storage.get_id('<END>')  # as it appears twice
        actual = generator._generate_next_word(context)
        self.assertEqual(expected_top_freq, actual)
Code example #17
    def test_ngram_text_generator_generate_sentence_ideal(self):
        """
        first and last generated words are as expected
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is',
                  'rex', 'too', '<END>')
        storage = WordStorage()
        storage.update(corpus)
        encoded = encode_text(storage, corpus)
        trie = NGramTrie(2, encoded)
        context = (storage.get_id('i'), )

        first_generated = storage.get_id('have')
        last_generated = storage.get_id('<END>')

        generator = NGramTextGenerator(storage, trie)
        actual = generator._generate_sentence(context)
        self.assertEqual(actual[1], first_generated)
        self.assertEqual(actual[-1], last_generated)
Code example #18
    def test_generate_text_ideal(self):
        """
        should generate a simple case with three sentences from a small corpus
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is',
                  'rex', 'too', '<END>')

        storage = WordStorage()
        storage.update(corpus)

        encoded = encode_text(storage, corpus)

        trie = NGramTrie(2, encoded)

        generator = NGramTextGenerator(storage, trie)

        context = (storage.get_id('bruno'), )
        end = storage.get_id('<END>')
        actual = generator.generate_text(context, 3)
        self.assertEqual(actual.count(end), 3)
Code example #19
    def test_ngram_text_generator_end_at_the_beginning(self):
        """"
        should generate a sentence without <END> in any other position except the end of the sentence
        """
        corpus = ('i', 'like', 'to', 'read', '<END>', 'he', 'likes', 'to',
                  'read', 'too', 'i', 'like', 'a', 'book', 'called',
                  '"Harry Potter"', '<END>', 'he', 'likes', 'another', 'book',
                  '<END>', 'he', 'does', 'not', 'tell', 'me', 'name', '<END>')

        storage = WordStorage()
        storage.update(corpus)
        encoded = encode_text(storage, corpus)
        trie = NGramTrie(2, encoded)
        context = (storage.get_id('<END>'), )

        last_generated = storage.get_id('<END>')

        generator = NGramTextGenerator(storage, trie)
        actual = generator._generate_sentence(context)

        self.assertEqual(last_generated, actual[-1])
        self.assertEqual(1, actual.count(storage.get_id('<END>')))
Code example #20
    def test_save_model_incorrect(self):
        """
        Checks the save_model function with incorrect inputs
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>',
                  'his', 'name', 'is', 'bruno', '<END>',
                  'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>',
                  'her', 'name', 'is', 'rex', 'too', '<END>')

        storage = WordStorage()
        storage.update(corpus)

        encoded = encode_text(storage, corpus)

        trie = NGramTrie(2, encoded)

        generator = NGramTextGenerator(storage, trie)

        bad_inputs = ((), [], {}, 123, None, WordStorage)

        for bad_input in bad_inputs:
            self.assertRaises(ValueError, save_model, generator, bad_input)
Code example #21
    def test_ngram_trie_unigrams(self):
        """
        Checks that class creates correct unigrams
        """
        corpus = (1, 2, 3, 4, 5, 6, 7, 8, 9, 5, 1, 2, 3, 10, 11, 5, 6, 7, 8,
                  12, 5, 13, 7, 8, 12, 11, 5)
        instance = NGramTrie(2, corpus)
        unigrams = {
            (1, ): 2,
            (2, ): 2,
            (3, ): 2,
            (4, ): 1,
            (5, ): 5,
            (6, ): 2,
            (7, ): 3,
            (8, ): 3,
            (9, ): 1,
            (10, ): 1,
            (11, ): 2,
            (12, ): 2,
            (13, ): 1
        }
        self.assertEqual(unigrams, instance.uni_grams)
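
The expected unigram dictionary above can be cross-checked with collections.Counter. This is only a sanity check over the raw corpus, not a claim about how NGramTrie is required to build uni_grams internally.

from collections import Counter

corpus = (1, 2, 3, 4, 5, 6, 7, 8, 9, 5, 1, 2, 3, 10, 11, 5, 6, 7, 8,
          12, 5, 13, 7, 8, 12, 11, 5)

# Rebuild the expected dictionary, keyed by 1-tuples as in the test above
unigrams = {(token, ): count for token, count in Counter(corpus).items()}
print(unigrams[(5, )])  # 5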
Code example #22
    def test_generate_next_word_no_context(self):
        """
        Checks that the next word is generated even if the context isn't found
        """
        corpus = ('he', 'likes', 'a', 'cat', 'but', 'he', 'does', 'not',
                  'like', 'parrots', '<END>', 'he', 'says', 'that', 'his',
                  'name', 'is', 'bruno', '<END>')
        storage = WordStorage()
        storage.update(corpus)

        encoded = encode_text(storage, corpus)

        trie = NGramTrie(3, encoded)
        context = (
            storage.get_id('cat'),
            storage.get_id('is'),
        )

        generator = LikelihoodBasedTextGenerator(storage, trie)

        expected_most_frequent = storage.get_id('he')
        actual = generator._generate_next_word(context)
        self.assertEqual(expected_most_frequent, actual)
Code example #23
    def test_calculate_likelihood_incorrect_context(self):
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is',
                  'rex', 'too', '<END>')

        storage = WordStorage()
        storage.update(corpus)
        encoded = encode_text(storage, corpus)
        trie = NGramTrie(2, encoded)

        # (2000, 1000) is a context sized for a trigram, not this bigram trie
        bad_inputs = [[], {}, (2000, 1000), None, 9, 9.34, True]
        word = storage.get_id('dog')

        generator = LikelihoodBasedTextGenerator(storage, trie)

        for bad_context in bad_inputs:
            self.assertRaises(ValueError,
                              generator._calculate_maximum_likelihood, word,
                              bad_context)
Code example #24
    def test_calculate_likelihood_incorrect_word(self):
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is',
                  'rex', 'too', '<END>')

        storage = WordStorage()
        storage.update(corpus)
        encoded = encode_text(storage, corpus)
        trie = NGramTrie(2, encoded)

        bad_inputs = [(), [], None, 123]
        context = (
            storage.get_id('have'),
            storage.get_id('a'),
        )

        generator = LikelihoodBasedTextGenerator(storage, trie)

        for bad_word in bad_inputs:
            self.assertRaises(ValueError,
                              generator._calculate_maximum_likelihood,
                              bad_word, context)
Code example #25
    def test_calculate_likelihood_ideal(self):
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is',
                  'rex', 'too', '<END>')

        storage = WordStorage()
        storage.update(corpus)

        encoded = encode_text(storage, corpus)

        trie = NGramTrie(3, encoded)
        word = storage.get_id('dog')
        context = (
            storage.get_id('have'),
            storage.get_id('a'),
        )

        generator = LikelihoodBasedTextGenerator(storage, trie)

        expected = 1 / 2
        actual = generator._calculate_maximum_likelihood(word, context)
        self.assertEqual(expected, actual)
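
The expected value of 1 / 2 follows from simple counting: the context ('have', 'a') occurs twice in this corpus, followed once by 'cat' and once by 'dog'. The sketch below only illustrates that count on raw tokens; it is not the lab's LikelihoodBasedTextGenerator, which works on encoded ids.

# Same corpus as in test_calculate_likelihood_ideal, counted on raw tokens
corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
          'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
          'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is',
          'rex', 'too', '<END>')

context = ('have', 'a')
context_count = 0  # how many times the context occurs
dog_count = 0      # how many of those occurrences are followed by 'dog'

for i in range(len(corpus) - 2):
    if corpus[i:i + 2] == context:
        context_count += 1
        if corpus[i + 2] == 'dog':
            dog_count += 1

print(dog_count / context_count)  # 1 / 2 = 0.5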
Code example #26
    def test_generate_next_word_same_prob(self):
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is',
                  'rex', 'too', '<END>')

        storage = WordStorage()
        storage.update(corpus)

        encoded = encode_text(storage, corpus)

        trie = NGramTrie(3, encoded)

        expected_word = storage.get_id('cat')
        context = (
            storage.get_id('have'),
            storage.get_id('a'),
        )

        generator = LikelihoodBasedTextGenerator(storage, trie)

        actual = generator._generate_next_word(context)
        self.assertEqual(expected_word, actual)
Code example #27
    def test_text_generator_generate_sentence_includes_context(self):
        """
        Checks that the class creates a correct sentence that starts
        with the context (if <END> is not in the context)
        """
        corpus = ('i', 'have', 'a', 'cat', 'and', 'a', 'dog', '<END>', 'his',
                  'name', 'is', 'bruno', '<END>', 'i', 'have', 'a', 'dog',
                  'too', '<END>', 'his', 'name', 'is', 'rex', '<END>', 'there',
                  'are', 'a', 'cat', 'and', 'a', 'bear', 'outside', '<END>',
                  'here', 'is', 'a', 'cat', 'outside', '<END>')

        storage = WordStorage()
        storage.update(corpus)

        encoded = encode_text(storage, corpus)

        trie = NGramTrie(3, encoded)

        generator = NGramTextGenerator(storage, trie)

        context = (storage.get_id('a'), storage.get_id('cat'))

        actual = generator._generate_sentence(context)

        self.assertEqual(context, actual[:len(context)])
Code example #28
"""
Lab 4 implementation starter
"""

from lab_4.main import BackOffGenerator, encode_text, WordStorage, decode_text, tokenize_by_sentence
from lab_4.ngrams.ngram_trie import NGramTrie

if __name__ == '__main__':
    with open('lab_3/Frank_Baum.txt', 'r', encoding='utf-8') as file_frank:
        corpus = tokenize_by_sentence(file_frank.read())

    storage = WordStorage()
    storage.update(corpus)
    encoded = encode_text(storage, corpus)

    trie = NGramTrie(3, encoded)
    four = NGramTrie(4, encoded)

    context = (
        storage.get_id('when'),
        storage.get_id('the'),
    )

    generator = BackOffGenerator(storage, four, trie)
    generated_text = generator.generate_text(context, 5)
    RESULT = decode_text(storage, generated_text)

    # DO NOT REMOVE NEXT LINE - KEEP IT INTENTIONALLY LAST
    assert RESULT, 'Encoding not working'
Code example #29

import lab_4.main
from lab_4.ngrams.ngram_trie import NGramTrie

if __name__ == '__main__':

    TEXT = """I have a cat. His name is Max. I have a dog. His name is Jake.
              I do not have a dog or a cat. I have a parrot. Her name is Max too.
              And I have a parrot too. But his name is Leo."""

    corpus = lab_4.main.tokenize_by_sentence(TEXT)

    storage = lab_4.main.WordStorage()
    storage.update(corpus)

    encoded_text = lab_4.main.encode_text(storage, corpus)

    ngram_trie_2 = NGramTrie(2, encoded_text)
    ngram_trie_3 = NGramTrie(3, encoded_text)
    ngram_trie_4 = NGramTrie(4, encoded_text)

    expected_word = storage.get_id('a')
    CONTEXT = (storage.get_id('i'),
               storage.get_id('have'),)

    generator_ngram_trie = lab_4.main.NGramTextGenerator(storage, ngram_trie_3)
    generator_likelihood = lab_4.main.LikelihoodBasedTextGenerator(storage, ngram_trie_3)
    generator_backoff = lab_4.main.BackOffGenerator(storage, ngram_trie_3, ngram_trie_2, ngram_trie_4)

    generated_text_1 = lab_4.main.decode_text(storage, generator_ngram_trie.generate_text(CONTEXT, 3))
    generated_text_2 = lab_4.main.decode_text(storage, generator_likelihood.generate_text(CONTEXT, 3))
    generated_text_3 = lab_4.main.decode_text(storage, generator_backoff.generate_text(CONTEXT, 3))
Code example #30
from lab_4.main import tokenize_by_sentence, WordStorage, encode_text, NGramTextGenerator
from lab_4.ngrams.ngram_trie import NGramTrie

if __name__ == '__main__':
    TEXT = 'I like flowers.\nMy mom likes flowers too.\nHer favourite flower is rose.\nMy favourite flower is rose too.'
    corpus = tokenize_by_sentence(TEXT)

    word_storage = WordStorage()
    word_storage.update(corpus)

    encoded_corpus = encode_text(word_storage, corpus)

    ngrams = NGramTrie(2, encoded_corpus)

    text_generator = NGramTextGenerator(word_storage, ngrams)
    gen_text = text_generator.generate_text((1, ), 2)

    end = word_storage.get_id('<END>')
    actual = gen_text.count(end)
    RESULT = 2
    print(actual)
    assert RESULT == actual, 'not working'