Esempio n. 1
0
    def test_decode_text_ideal_conditions(self):
        """
        Checks that every decoded sentence is free of the '<END>' marker,
            starts with an upper-case character and ends with a letter
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is',
                  'rex', 'too', '<END>')

        storage = WordStorage()
        storage.update(corpus)

        encoded = encode_text(storage, corpus)

        trie = NGramTrie(3, encoded)

        context = (
            storage.get_id('name'),
            storage.get_id('is'),
        )

        generator = LikelihoodBasedTextGenerator(storage, trie)

        to_decode = generator.generate_text(context, 2)
        actual = decode_text(storage, to_decode)

        for sentence in actual:
            # assertNotIn reports both operands on failure, which a bare
            # assertTrue on a boolean expression cannot do
            self.assertNotIn('<END>', sentence)
            self.assertTrue(sentence[0].isupper())
            self.assertTrue(sentence[-1].isalpha())
    def test_end(self):
        """
             Checks that the text generated from the context ('a', 'cat')
             is decoded to ('A cat', ), i.e. without an end marker
        """
        words = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                 'bruno', '<END>', 'i', 'have', 'a', 'cat', '<END>', 'his',
                 'name', 'is', 'rex', '<END>', 'her', 'name', 'is', 'cat',
                 '<END>')

        word_storage = WordStorage()
        word_storage.update(words)

        # The trie is built with n = 3, so the context holds two word ids
        n_gram_trie = NGramTrie(3, encode_text(word_storage, words))
        start_ids = tuple(word_storage.get_id(word) for word in ('a', 'cat'))

        text_generator = LikelihoodBasedTextGenerator(word_storage, n_gram_trie)
        generated = text_generator.generate_text(start_ids, 1)

        self.assertEqual(('A cat', ), decode_text(word_storage, generated))
Esempio n. 3
0
    def test_decode_text_ideal(self):
        """
        Checks that the generated sequence finishes with the '<END>' id
            and is decoded to the two expected capitalised sentences
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is',
                  'rex', 'too', '<END>')

        storage = WordStorage()
        storage.update(corpus)

        encoded = encode_text(storage, corpus)

        trie = NGramTrie(3, encoded)

        context = (
            storage.get_id('name'),
            storage.get_id('is'),
        )
        end = storage.get_id('<END>')

        generator = LikelihoodBasedTextGenerator(storage, trie)

        to_decode = generator.generate_text(context, 2)
        # The raw generated sequence must be closed by the end-marker id
        self.assertEqual(to_decode[-1], end)

        expected = ('Name is rex', 'Her name is rex')
        actual = decode_text(storage, to_decode)
        self.assertEqual(expected, actual)
Esempio n. 4
0
def realize_n_gram_text_generator(text):
    """
    Runs NGramTextGenerator over the given text and returns the decoded result

    A WordStorage is filled from text, the text is encoded and placed into
    an NGramTrie created with size 3. Generation starts from the ids of the
    words 'my' and 'dear' and is called with 3 as its second argument.

    :param text: tokens accepted by WordStorage.update and encode_text
    :return: the value produced by decode_text for the generated sequence
    """
    storage = WordStorage()
    storage.update(text)
    start_ids = tuple(storage.get_id(word) for word in ('my', 'dear'))
    trie = NGramTrie(3, encode_text(storage, text))
    generated = NGramTextGenerator(storage, trie).generate_text(start_ids, 3)
    return decode_text(storage, generated)
Esempio n. 5
0
def realize_likelihood_generator(text):
    """
    Generates text with a likelihood-based generator restored from
    'lab_4/likelihood_model.json' and returns it decoded

    The context ids, the generation and the decoding all use the word
    storage of the loaded model. Previously the context was encoded and the
    output decoded with a separate storage built from text, whose ids are
    not guaranteed to match the ids stored in the loaded model.

    :param text: kept so that existing callers keep working; it is no longer
        read, because the loaded model carries its own storage and trie
    :return: the value produced by decode_text for the generated sequence
    """
    model = load_model('lab_4/likelihood_model.json')
    storage = model.word_storage

    # Look the context words up in the same storage the trie was built with
    context = (storage.get_id('i'),
               storage.get_id('shall'),)
    generator = LikelihoodBasedTextGenerator(storage, model.n_gram_trie)
    likelihood_text_generated = generator.generate_text(context, 3)

    return decode_text(storage, likelihood_text_generated)
Esempio n. 6
0
 def test_decode_text_empty_sentence(self):
     """
     Checks that decode_text returns an empty tuple
         when it is given an empty corpus
     """
     empty_corpus = ()
     storage = WordStorage()
     storage.update(empty_corpus)
     self.assertEqual((), decode_text(storage, empty_corpus))
Esempio n. 7
0
def realize_backoff_generator(text):
    """
    Runs BackOffGenerator over the given text and returns the decoded result

    A WordStorage is filled from text and the encoded text is placed into
    two tries created with sizes 2 and 3. The generator receives the size-3
    trie first and the size-2 trie second, starts from the ids of the words
    'if' and 'you', and is called with 3 as its second argument.

    :param text: tokens accepted by WordStorage.update and encode_text
    :return: the value produced by decode_text for the generated sequence
    """
    storage = WordStorage()
    storage.update(text)
    encoded = encode_text(storage, text)

    bigram_trie = NGramTrie(2, encoded)
    trigram_trie = NGramTrie(3, encoded)
    start_ids = tuple(storage.get_id(word) for word in ('if', 'you'))

    generator = BackOffGenerator(storage, trigram_trie, bigram_trie)
    return decode_text(storage, generator.generate_text(start_ids, 3))
Esempio n. 8
0
    def test_decode_text_upper_first_letter(self):
        '''
        Tests that the first letter of a decoded sentence is in upper case
            and all the remaining letters are in lower case
        '''
        corpus = ('first', 'sentence', 'here', '<END>', 'second', 'sentence',
                  'here', '<END>', 'third', 'sentence', 'here', '<END>')

        storage = WordStorage()
        storage.update(corpus)

        encoded_text = encode_text(storage, corpus)
        trie = NGramTrie(3, encoded_text)
        context = (storage.get_id('first'), storage.get_id('sentence'))

        likelihood_generator = LikelihoodBasedTextGenerator(storage, trie)
        generated_encoded_text = likelihood_generator.generate_text(context, 1)
        decoded_text = decode_text(storage, generated_encoded_text)

        first_sentence = decoded_text[0]
        self.assertTrue(first_sentence[0].isupper())
        # "not isupper()" already holds when a single letter is lower-case;
        # islower() requires every cased character of the tail to be lower
        self.assertTrue(first_sentence[1:].islower())
Esempio n. 9
0
"""
Lab 4 implementation starter
"""

from lab_4.main import BackOffGenerator, encode_text, WordStorage, decode_text, tokenize_by_sentence
from lab_4.ngrams.ngram_trie import NGramTrie

if __name__ == '__main__':
    # Read the whole source text first, then split it into tokens
    with open('lab_3/Frank_Baum.txt', 'r', encoding='utf-8') as source_file:
        raw_text = source_file.read()
    tokens = tokenize_by_sentence(raw_text)

    word_storage = WordStorage()
    word_storage.update(tokens)
    encoded_tokens = encode_text(word_storage, tokens)

    # Two tries over the same encoded text, created with sizes 3 and 4
    trigram_trie = NGramTrie(3, encoded_tokens)
    four_gram_trie = NGramTrie(4, encoded_tokens)

    start_ids = tuple(word_storage.get_id(word) for word in ('when', 'the'))

    # The size-4 trie is passed first, the size-3 trie second
    backoff_generator = BackOffGenerator(word_storage, four_gram_trie, trigram_trie)
    RESULT = decode_text(word_storage,
                         backoff_generator.generate_text(start_ids, 5))

    # DO NOT REMOVE NEXT LINE - KEEP IT INTENTIONALLY LAST
    assert RESULT, 'Encoding not working'
Esempio n. 10
0
from lab_4.main import LikelihoodBasedTextGenerator
from lab_4.main import BackOffGenerator

if __name__ == '__main__':
    # Demo script: runs two generators over a toy corpus and prints the
    # decoded output of each.
    # NOTE(review): WordStorage, encode_text, decode_text, NGramTextGenerator
    # and ngrams are not imported in the lines visible here - confirm they
    # are imported above.
    corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is', 'bruno',
              '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>', 'his', 'name',
              'is', 'rex', '<END>', 'her', 'name', 'is', 'rex', 'too', '<END>')
    storage = WordStorage()
    storage.update(corpus)
    encoded = encode_text(storage, corpus)
    # Trie created with size 2, paired with a one-word context
    trie = ngrams.NGramTrie(2, encoded)
    context = (storage.get_id('i'), )

    # First run: NGramTextGenerator
    generator = NGramTextGenerator(storage, trie)
    actual = generator.generate_text(context, 5)
    actual = decode_text(storage, actual)
    print(actual)

    # Second run: LikelihoodBasedTextGenerator on the same trie and context
    generator = LikelihoodBasedTextGenerator(storage, trie)
    actual = generator.generate_text(context, 5)
    actual = decode_text(storage, actual)
    print(actual)

    # Tries of sizes 2 and 3 plus a two-word context are prepared here.
    # NOTE(review): nothing in the visible lines uses them; presumably the
    # imported BackOffGenerator consumes them further down - confirm.
    two = ngrams.NGramTrie(2, encoded)
    trie = ngrams.NGramTrie(3, encoded)

    context = (
        storage.get_id('name'),
        storage.get_id('is'),
    )
Esempio n. 11
0
"""

from lab_4.main import LikelihoodBasedTextGenerator, encode_text, WordStorage, decode_text
from lab_4.ngrams.ngram_trie import NGramTrie

if __name__ == '__main__':
    # Toy corpus of five sentences, each closed by the '<END>' marker
    corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is', 'bruno',
              '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>', 'his', 'name',
              'is', 'rex', '<END>', 'her', 'name', 'is', 'rex', 'too', '<END>')

    storage = WordStorage()
    storage.update(corpus)

    encoded = encode_text(storage, corpus)

    # The trie is created with size 3, so the context holds two word ids
    trie = NGramTrie(3, encoded)

    context = (
        storage.get_id('name'),
        storage.get_id('is'),
    )

    generator = LikelihoodBasedTextGenerator(storage, trie)

    to_decode = generator.generate_text(context, 2)

    # The decoded output must match these two sentences exactly
    EXPECTED = ('Name is rex', 'Her name is rex')
    RESULT = decode_text(storage, to_decode)
    assert RESULT == EXPECTED, 'Encoding not working'
Esempio n. 12
0
from lab_4.main import tokenize_by_sentence
from lab_4.main import WordStorage, LikelihoodBasedTextGenerator
from lab_4.main import encode_text, decode_text
from lab_4.main import NGramTextGenerator

if __name__ == '__main__':
    # NGramTrie is used below but is not among the imports visible at the
    # top of this script, so it is brought into scope here to avoid a
    # NameError when the script is run.
    from lab_4.ngrams.ngram_trie import NGramTrie

    TEXT = 'I have a cat. His name is Bruno. I have a dog too. His name is Rex. Her name is Rex too.'
    tokenized_text = tokenize_by_sentence(TEXT)

    word_storage = WordStorage()
    word_storage.update(tokenized_text)

    encoded = encode_text(word_storage, tokenized_text)

    # The trie is created with size 3, so the context holds two word ids
    trie = NGramTrie(3, encoded)
    context = (
        word_storage.get_id('name'),
        word_storage.get_id('is'),
    )

    # Plain n-gram generation; its output is not checked below
    generator = NGramTextGenerator(word_storage, trie)
    generated_text = generator.generate_text(context, 2)

    # Likelihood-based generation; this is the output that gets verified
    gen_likelihood = LikelihoodBasedTextGenerator(word_storage, trie)
    gen_text = gen_likelihood.generate_text(context, 2)
    decoded_text = decode_text(word_storage, gen_text)

    RESULT = decoded_text

    assert RESULT == ('Name is rex', 'Her name is rex'), "Not working"
Esempio n. 13
0
              'too', '<END>')

    storage = WordStorage()
    storage.update(corpus)

    encoded = encode_text(storage, corpus)

    trie = NGramTrie(4, encoded)

    context = (storage.get_id('i'), storage.get_id('have'),
               storage.get_id('a'))

    generator_likelihood = LikelihoodBasedTextGenerator(storage, trie)

    generated_text = generator_likelihood.generate_text(context, 3)
    decoded_gen_text = decode_text(storage, generated_text)
    print('Likelihood generator generates sentences:')
    print(*decoded_gen_text, sep='. ', end='.\n')

    two = NGramTrie(2, encoded)
    trie = NGramTrie(3, encoded)

    context = (
        storage.get_id('i'),
        storage.get_id('have'),
    )

    generator_backoff = BackOffGenerator(storage, trie, two)

    actual = generator_backoff.generate_text(context, 3)
    RESULT = decode_text(storage, actual)
"""
Lab 4 starter
"""
from lab_4.main import BackOffGenerator, encode_text, WordStorage, decode_text
from lab_4.ngrams.ngram_trie import NGramTrie

if __name__ == '__main__':
    # Toy corpus of five sentences, each closed by the '<END>' marker
    words = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is', 'bruno',
             '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>', 'his', 'name',
             'is', 'rex', '<END>', 'her', 'name', 'is', 'rex', 'too', '<END>')

    word_storage = WordStorage()
    word_storage.update(words)
    encoded_words = encode_text(word_storage, words)

    # Two tries over the same encoded text, created with sizes 3 and 4
    trigram_trie = NGramTrie(3, encoded_words)
    four_gram_trie = NGramTrie(4, encoded_words)

    start_ids = tuple(word_storage.get_id(word)
                      for word in ('his', 'name', 'is'))

    # The size-3 trie is passed first, the size-4 trie second
    backoff_generator = BackOffGenerator(word_storage, trigram_trie, four_gram_trie)
    actual = decode_text(word_storage,
                         backoff_generator.generate_text(start_ids, 3))

    # RESULT holds the expected sentences that the decoded output must equal
    RESULT = ('His name is bruno', 'I have a cat', 'His name is bruno')
    assert RESULT == actual, 'Not work'
Esempio n. 15
0
    cannot have egg bacon spam and sausage without the spam. I do not like spam! Sshh, dear, do not cause a fuss.
    I will have your spam. I love it. I am having spam beaked beans spam and spam! Lovely spam! Wonderful spam! 
    Shut up! Baked beans are off. Well could I have her spam instead of the baked beans then?'''

    corpus = tokenize_by_sentence(TEXT)

    storage = WordStorage()
    storage.update(corpus)

    encoded_text = encode_text(storage, corpus)

    n_gram_trie = NGramTrie(3, encoded_text)

    generator = LikelihoodBasedTextGenerator(storage, n_gram_trie)

    context = (storage.get_id('bloody'), storage.get_id('vikings'))
    generated_text = generator.generate_text(context, 5)

    decoded_text = decode_text(storage, generated_text)

    IS_WORKING = True
    for sentence in decoded_text:
        if '<END>' in sentence or not sentence[0].isupper(
        ) or not sentence[-1].isalpha():
            IS_WORKING = False

    print(decoded_text)

    RESULT = IS_WORKING
    assert RESULT, 'Not working'