Exemple #1
0
def main():
    """
    Run a small back-off generation demo on a hard-coded text.

    The text is tokenized, stored and encoded, tries of sizes 2 and 3
    are built from the encoded tokens, and the generator is asked for
    the word that follows the context 'name is'. The text, the expected
    word and the actual word are printed; afterwards the generator is
    handed to save_model and 'model.txt' is handed to load_model.

    :return: result of comparing the generated word with 'rex'
    """
    text = ('I have a cat. His name is Bruno. '
            'I have a dog too. His name is Rex. '
            'Her name is Rex too.')

    tokens = tokenize_by_sentence(text)

    word_storage = WordStorage()
    word_storage.update(tokens)
    encoded_tokens = encode_text(word_storage, tokens)

    bigram_trie = NGramTrie(2, encoded_tokens)
    trigram_trie = NGramTrie(3, encoded_tokens)

    context = tuple(word_storage.get_id(word) for word in ('name', 'is'))

    # the size-3 trie is passed before the size-2 one
    generator = BackOffGenerator(word_storage, trigram_trie, bigram_trie)

    expected = 'rex'
    next_word_id = generator._generate_next_word(context)
    actual = word_storage.get_word(next_word_id)

    print(f'TEXT:\n{text}')
    print(f'\nEXPECTED WORD AFTER name is IS {expected}')
    print(f'ACTUAL WORD AFTER name is IS {actual}')

    save_model(generator, 'model.txt')
    # the value returned by load_model is not used here
    load_model('model.txt')

    return actual == expected
Exemple #2
0
    def test_most_freq_word_end(self):
        """
        Checks that most_freq_word returns the id of '<END>'
        for the context 'his name is bruno'
        """
        tokens = ('i', 'have', 'a', 'cat', '<END>',
                  'his', 'name', 'is', 'bruno', '<END>',
                  'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>',
                  'her', 'name', 'is', 'rex', 'too', '<END>')

        word_storage = WordStorage()
        word_storage.update(tokens)
        encoded_tokens = encode_text(word_storage, tokens)

        five_gram_trie = NGramTrie(5, encoded_tokens)
        three_gram_trie = NGramTrie(3, encoded_tokens)
        four_gram_trie = NGramTrie(4, encoded_tokens)

        end_id = word_storage.get_id('<END>')
        context = tuple(word_storage.get_id(word)
                        for word in ('his', 'name', 'is', 'bruno'))

        # tries are handed over in the order 5, 3, 4
        generator = BackOffGenerator(word_storage, five_gram_trie,
                                     three_gram_trie, four_gram_trie)

        predicted_id = generator.most_freq_word(context)
        self.assertEqual(end_id, predicted_id)
Exemple #3
0
    def test_generate_next_word_context_incorrect(self):
        """
        Checks that _generate_next_word returns the id of 'rex'
        for the context 'name is cat'
        """
        tokens = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is',
                  'rex', 'too', '<END>')

        word_storage = WordStorage()
        word_storage.update(tokens)
        encoded_tokens = encode_text(word_storage, tokens)

        four_gram_trie = NGramTrie(4, encoded_tokens)
        three_gram_trie = NGramTrie(3, encoded_tokens)
        two_gram_trie = NGramTrie(2, encoded_tokens)

        rex_id = word_storage.get_id('rex')
        context = tuple(word_storage.get_id(word)
                        for word in ('name', 'is', 'cat'))

        # tries are handed over in the order 4, 2, 3
        generator = BackOffGenerator(word_storage, four_gram_trie,
                                     two_gram_trie, three_gram_trie)

        predicted_id = generator._generate_next_word(context)
        self.assertEqual(rex_id, predicted_id)
    def test_text_generator_no_context(self):
        """
        Checks that every item produced by generate_text is truthy
        when it is called with the context 'cat dogs' and the number 3
        """
        tokens = ('cat', 'has', 'paws', '<END>',
                  'dogs', 'have', 'noses', '<END>',
                  'cat', 'has', 'whiskers', '<END>')

        word_storage = WordStorage()
        word_storage.update(tokens)
        encoded_tokens = encode_text(word_storage, tokens)

        three_gram_trie = NGramTrie(3, encoded_tokens)
        two_gram_trie = NGramTrie(2, encoded_tokens)
        four_gram_trie = NGramTrie(4, encoded_tokens)

        context = tuple(word_storage.get_id(word)
                        for word in ('cat', 'dogs'))

        # tries are handed over in the order 3, 2, 4
        generator = BackOffGenerator(word_storage, three_gram_trie,
                                     two_gram_trie, four_gram_trie)

        generated = generator.generate_text(context, 3)
        self.assertTrue(all(generated))
Exemple #5
0
def realize_backoff_generator(text):
    """
    Generate text with a BackOffGenerator built from the given corpus.

    The corpus is stored and encoded, tries of sizes 2 and 3 are built,
    and generate_text is called with the context 'if you' and the
    number 3.

    :param text: corpus passed to WordStorage.update and encode_text
    :return: the generated text after decode_text has been applied
    """
    word_storage = WordStorage()
    word_storage.update(text)
    encoded_corpus = encode_text(word_storage, text)

    bigram_trie = NGramTrie(2, encoded_corpus)
    trigram_trie = NGramTrie(3, encoded_corpus)

    start_context = tuple(word_storage.get_id(word)
                          for word in ('if', 'you'))

    # the size-3 trie is passed before the size-2 one
    generator = BackOffGenerator(word_storage, trigram_trie, bigram_trie)
    generated = generator.generate_text(start_context, 3)

    return decode_text(word_storage, generated)
Exemple #6
0
    def test_most_freq_word_incorrect_context(self):
        """
        Checks that most_freq_word raises ValueError
        for each of the invalid contexts listed below
        """
        tokens = ('i', 'have', 'a', 'cat', '<END>',
                  'his', 'name', 'is', 'bruno', '<END>',
                  'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>',
                  'her', 'name', 'is', 'rex', 'too', '<END>')

        word_storage = WordStorage()
        word_storage.update(tokens)
        encoded_tokens = encode_text(word_storage, tokens)

        three_gram_trie = NGramTrie(3, encoded_tokens)
        two_gram_trie = NGramTrie(2, encoded_tokens)
        four_gram_trie = NGramTrie(4, encoded_tokens)

        invalid_contexts = [[], {}, (2000, 1000), None, 9, 9.34, True]

        # tries are handed over in the order 3, 2, 4
        generator = BackOffGenerator(word_storage, three_gram_trie,
                                     two_gram_trie, four_gram_trie)

        for invalid_context in invalid_contexts:
            with self.assertRaises(ValueError):
                generator.most_freq_word(invalid_context)
Exemple #7
0
    def test_backoff_generator_instance_creation(self):
        """
        Checks that a new generator keeps the storage it was given
        in _word_storage and the trie it was given in _n_gram_tries
        """
        empty_storage = WordStorage()
        bigram_trie = NGramTrie(2, ())

        generator = BackOffGenerator(empty_storage, bigram_trie)

        self.assertEqual(generator._word_storage, empty_storage)
        stored_tries = generator._n_gram_tries
        self.assertTrue(bigram_trie in stored_tries)
Exemple #8
0
    def test_generate_next_word_short_context(self):
        """
        Checks that _generate_next_word returns the id of 'bye'
        for the one-word context '<END>' on a two-token corpus
        """
        tokens = ('bye', '<END>')

        word_storage = WordStorage()
        word_storage.update(tokens)
        encoded_tokens = encode_text(word_storage, tokens)

        four_gram_trie = NGramTrie(4, encoded_tokens)
        three_gram_trie = NGramTrie(3, encoded_tokens)
        two_gram_trie = NGramTrie(2, encoded_tokens)

        bye_id = word_storage.get_id('bye')
        context = (word_storage.get_id('<END>'),)

        # tries are handed over in the order 2, 4, 3
        generator = BackOffGenerator(word_storage, two_gram_trie,
                                     four_gram_trie, three_gram_trie)

        predicted_id = generator._generate_next_word(context)
        self.assertEqual(bye_id, predicted_id)
Exemple #9
0
    def test_backoff_generator_instance_creation_complex(self):
        """
        Checks that a generator created with three tries keeps the
        given storage and holds every one of the tries
        """
        empty_storage = WordStorage()
        tries = (NGramTrie(2, ()), NGramTrie(3, ()), NGramTrie(4, ()))

        generator = BackOffGenerator(empty_storage, *tries)

        self.assertEqual(generator._word_storage, empty_storage)
        for given_trie in tries:
            self.assertTrue(given_trie in generator._n_gram_tries)
Exemple #10
0
    def test_generate_next_word_no_context(self):
        """
        Checks that _generate_next_word returns the id of '<END>'
        for the context 'bye'; 'bye' is the last token of the corpus,
        so no word follows it there
        """
        # every token is a separate tuple item: without the comma between
        # 'watch' and 'with' Python would concatenate the adjacent literals
        # into the single token 'watchwith'
        corpus = ('i', 'watch', 'a', 'horror', 'movie', '<END>',
                  'would', 'you', 'like', 'to', 'watch', 'with', 'me',
                  '<END>',
                  'i', 'do', 'not', 'like', 'such', 'films', '<END>',
                  'i', 'like', 'to', 'watch', 'drama', 'movies', '<END>',
                  'bye')

        storage = WordStorage()
        storage.update(corpus)

        encoded = encode_text(storage, corpus)

        four = NGramTrie(4, encoded)
        trie = NGramTrie(3, encoded)
        two = NGramTrie(2, encoded)

        expected_word = storage.get_id('<END>')
        context = (storage.get_id('bye'),)

        # tries are handed over in the order 2, 4, 3
        generator = BackOffGenerator(storage, two, four, trie)

        actual = generator._generate_next_word(context)
        self.assertEqual(expected_word, actual)
"""
Lab 4 implementation starter
"""

from lab_4.main import BackOffGenerator, encode_text, WordStorage, decode_text, tokenize_by_sentence
from lab_4.ngrams.ngram_trie import NGramTrie

if __name__ == '__main__':
    # read the whole file first; tokenizing happens after it is closed
    with open('lab_3/Frank_Baum.txt', encoding='utf-8') as source_file:
        raw_text = source_file.read()
    tokens = tokenize_by_sentence(raw_text)

    word_storage = WordStorage()
    word_storage.update(tokens)
    encoded_tokens = encode_text(word_storage, tokens)

    trigram_trie = NGramTrie(3, encoded_tokens)
    four_gram_trie = NGramTrie(4, encoded_tokens)

    start_context = tuple(word_storage.get_id(word)
                          for word in ('when', 'the'))

    # the size-4 trie is passed before the size-3 one
    generator = BackOffGenerator(word_storage, four_gram_trie, trigram_trie)
    RESULT = decode_text(word_storage,
                         generator.generate_text(start_context, 5))

    # DO NOT REMOVE NEXT LINE - KEEP IT INTENTIONALLY LAST
    assert RESULT, 'Encoding not working'
Exemple #12
0
    storage.update(corpus)
    encoded = encode_text(storage, corpus)
    trie = ngrams.NGramTrie(2, encoded)
    context = (storage.get_id('i'), )

    generator = NGramTextGenerator(storage, trie)
    actual = generator.generate_text(context, 5)
    actual = decode_text(storage, actual)
    print(actual)

    generator = LikelihoodBasedTextGenerator(storage, trie)
    actual = generator.generate_text(context, 5)
    actual = decode_text(storage, actual)
    print(actual)

    two = ngrams.NGramTrie(2, encoded)
    trie = ngrams.NGramTrie(3, encoded)

    context = (
        storage.get_id('name'),
        storage.get_id('is'),
    )

    generator = BackOffGenerator(storage, trie, two)

    actual = generator.generate_text(context, 5)
    RESULT = decode_text(storage, actual)
    print(RESULT)
    assert RESULT == ('Name is rex', 'Her name is rex', 'Her name is rex',
                      'Her name is rex', 'Her name is rex')
    trie = NGramTrie(4, encoded)

    context = (storage.get_id('i'), storage.get_id('have'),
               storage.get_id('a'))

    generator_likelihood = LikelihoodBasedTextGenerator(storage, trie)

    generated_text = generator_likelihood.generate_text(context, 3)
    decoded_gen_text = decode_text(storage, generated_text)
    print('Likelihood generator generates sentences:')
    print(*decoded_gen_text, sep='. ', end='.\n')

    two = NGramTrie(2, encoded)
    trie = NGramTrie(3, encoded)

    context = (
        storage.get_id('i'),
        storage.get_id('have'),
    )

    generator_backoff = BackOffGenerator(storage, trie, two)

    actual = generator_backoff.generate_text(context, 3)
    RESULT = decode_text(storage, actual)
    print('Backoff generator generates sentences:')
    print(*RESULT, sep='. ', end='.\n')

    assert RESULT == (
        'I have a colourful dog', 'I havent a cat too',
        'They have beautiful dogs'), 'Text generator does not work'
"""
Lab 4 starter
"""
from lab_4.main import BackOffGenerator, encode_text, WordStorage, decode_text
from lab_4.ngrams.ngram_trie import NGramTrie

if __name__ == '__main__':
    tokens = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'dog', 'too', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'rex', 'too', '<END>')

    word_storage = WordStorage()
    word_storage.update(tokens)
    encoded_tokens = encode_text(word_storage, tokens)

    trigram_trie = NGramTrie(3, encoded_tokens)
    four_gram_trie = NGramTrie(4, encoded_tokens)

    start_context = tuple(word_storage.get_id(word)
                          for word in ('his', 'name', 'is'))

    # the size-3 trie is passed before the size-4 one
    generator = BackOffGenerator(word_storage, trigram_trie, four_gram_trie)

    generated = generator.generate_text(start_context, 3)
    actual = decode_text(word_storage, generated)

    # the decoded output is compared with the three expected sentences
    RESULT = ('His name is bruno', 'I have a cat', 'His name is bruno')
    assert RESULT == actual, 'Not work'