Example no. 1
    def test_generate_text_large_context(self):
        """
        should generate simple case with three sentences out of small corpus
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is',
                  'rex', 'too', '<END>')

        storage = WordStorage()
        storage.update(corpus)

        encoded = encode_text(storage, corpus)

        trie = NGramTrie(5, encoded)

        generator = NGramTextGenerator(storage, trie)

        context = (
            storage.get_id('i'),
            storage.get_id('have'),
            storage.get_id('a'),
            storage.get_id('bruno'),
        )
        end = storage.get_id('<END>')
        actual = generator.generate_text(context, 3)
        self.assertEqual(actual.count(end), 3)

    def test_load_model_takes_less_time(self):
        with open('lab_1/data.txt', 'r', encoding='utf-8') as big_file:
            big_data = big_file.read()
        tokenized_data = tokenize_by_sentence(big_data)
        storage = WordStorage()
        storage.update(tokenized_data)
        context = (
            storage.get_id('despite'),
            storage.get_id('the'),
        )

        start_time_generate = time()
        encoded = encode_text(storage, tokenized_data)
        trie = NGramTrie(3, encoded)
        generator = NGramTextGenerator(storage, trie)
        generated_text = generator.generate_text(context, 3)
        end_time_generate = time() - start_time_generate
        save_model(generator, 'model_training')

        start_time_saved = time()
        loaded_model = load_model('model_training')
        new_result = loaded_model.generate_text(context, 3)
        end_time_saved = time() - start_time_saved

        self.assertGreater(end_time_generate, end_time_saved)
        self.assertEqual(generated_text, new_result)
Example no. 3
def realize_n_gram_text_generator(text):
    """Build a trigram model from the tokenized text and return three decoded sentences."""
    n_gram_storage = WordStorage()
    n_gram_storage.update(text)
    n_gram_context = (n_gram_storage.get_id('my'), n_gram_storage.get_id('dear'))
    n_gram_encoded = encode_text(n_gram_storage, text)
    n_gram_trie = NGramTrie(3, n_gram_encoded)
    n_gram_generator = NGramTextGenerator(n_gram_storage, n_gram_trie)
    n_gram_text_generated = n_gram_generator.generate_text(n_gram_context, 3)
    return decode_text(n_gram_storage, n_gram_text_generated)
Example no. 4
from lab_4.main import tokenize_by_sentence, WordStorage, encode_text, NGramTextGenerator
from lab_4.ngrams.ngram_trie import NGramTrie

if __name__ == '__main__':
    TEXT = 'I like flowers.\nMy mom likes flowers too.\nHer favourite flower is rose.\nMy favourite flower is rose too.'
    corpus = tokenize_by_sentence(TEXT)

    word_storage = WordStorage()
    word_storage.update(corpus)

    encoded_corpus = encode_text(word_storage, corpus)

    ngrams = NGramTrie(2, encoded_corpus)

    text_generator = NGramTextGenerator(word_storage, ngrams)
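    # context passed directly as a raw encoded word id instead of via get_id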
    gen_text = text_generator.generate_text((1, ), 2)

    end = word_storage.get_id('<END>')
    actual = gen_text.count(end)
    RESULT = 2
    print(actual)
    assert RESULT == actual, 'not working'
Example no. 5
from lab_4.main import WordStorage, encode_text, decode_text
from lab_4.main import NGramTextGenerator
from lab_4.main import LikelihoodBasedTextGenerator
from lab_4.main import BackOffGenerator
from lab_4.ngrams import ngram_trie as ngrams

if __name__ == '__main__':
    corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is', 'bruno',
              '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>', 'his', 'name',
              'is', 'rex', '<END>', 'her', 'name', 'is', 'rex', 'too', '<END>')
    storage = WordStorage()
    storage.update(corpus)
    encoded = encode_text(storage, corpus)
    trie = ngrams.NGramTrie(2, encoded)
    context = (storage.get_id('i'), )

    generator = NGramTextGenerator(storage, trie)
    actual = generator.generate_text(context, 5)
    actual = decode_text(storage, actual)
    print(actual)

    generator = LikelihoodBasedTextGenerator(storage, trie)
    actual = generator.generate_text(context, 5)
    actual = decode_text(storage, actual)
    print(actual)

    two = ngrams.NGramTrie(2, encoded)
    trie = ngrams.NGramTrie(3, encoded)

    context = (
        storage.get_id('name'),
        storage.get_id('is'),
    )
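    # BackOffGenerator is imported above but never used; the example presumably
    # meant to finish along these lines. The argument order (main trigram trie
    # first, then the fallback bigram trie) is an assumption, not shown here.
    generator = BackOffGenerator(storage, trie, two)
    actual = generator.generate_text(context, 5)
    actual = decode_text(storage, actual)
    print(actual)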
Example no. 6
"""
Lab 4 implementation starter
"""

from lab_4.main import tokenize_by_sentence, WordStorage, encode_text, NGramTextGenerator
from lab_4.ngrams.ngram_trie import NGramTrie

if __name__ == '__main__':
    text = 'I have a dog.\nHis name is Will'
    tokenize_text = tokenize_by_sentence(text)
    print(tokenize_text)

    storage = WordStorage()
    storage.update(tokenize_text)
    print(storage)

    encode = encode_text(storage, tokenize_text)
    print(encode)

    n_gram_trie = NGramTrie(2, encode)
    print(n_gram_trie)
    generator = NGramTextGenerator(storage, n_gram_trie)
    context = (storage.get_id('a'), )
    print(context)

    RESULT = generator.generate_text(context, 3)
    print(RESULT)
    # DO NOT REMOVE NEXT LINE - KEEP IT INTENTIONALLY LAST
    assert RESULT, 'Not working'
Example no. 7
from lab_4.main import tokenize_by_sentence
from lab_4.main import WordStorage, LikelihoodBasedTextGenerator
from lab_4.main import encode_text, decode_text
from lab_4.main import NGramTextGenerator
from lab_4.ngrams.ngram_trie import NGramTrie

if __name__ == '__main__':
    TEXT = 'I have a cat. His name is Bruno. I have a dog too. His name is Rex. Her name is Rex too.'
    tokenized_text = tokenize_by_sentence(TEXT)

    word_storage = WordStorage()
    word_storage.update(tokenized_text)

    encoded = encode_text(word_storage, tokenized_text)

    trie = NGramTrie(3, encoded)
    context = (
        word_storage.get_id('name'),
        word_storage.get_id('is'),
    )

    generator = NGramTextGenerator(word_storage, trie)
    generated_text = generator.generate_text(context, 2)

    gen_likelihood = LikelihoodBasedTextGenerator(word_storage, trie)
    gen_text = gen_likelihood.generate_text(context, 2)
    decoded_text = decode_text(word_storage, gen_text)

    RESULT = decoded_text

    assert RESULT == ('Name is rex', 'Her name is rex'), "Not working"
Example no. 8
"""
Text generator implementation starter
"""

from lab_4.main import WordStorage, NGramTrie, NGramTextGenerator
from lab_4.main import tokenize_by_sentence, encode_text

if __name__ == '__main__':
    # here goes your function calls
    # read the source text; the context manager closes the file afterwards
    with open('lab_3/Frank_Baum.txt', encoding="utf-8") as first_text:
        first_text_tokenized = tokenize_by_sentence(first_text.read())

    word_storage = WordStorage()
    word_storage.update(first_text_tokenized)

    encoded = encode_text(word_storage, first_text_tokenized)

    n_gram_trie = NGramTrie(2, encoded)
    generator = NGramTextGenerator(word_storage, n_gram_trie)

    # a one-word slice of the encoded corpus serves as the generation context
    RESULT = generator.generate_text(encoded[16:17], 3)
    #print(RESULT)

    assert RESULT, "Not working"
Example no. 9
from lab_4.main import tokenize_by_sentence, WordStorage, encode_text, NGramTextGenerator
from lab_4.ngrams.ngram_trie import NGramTrie

if __name__ == '__main__':

    text = tokenize_by_sentence(
        """Hi everyone! Nice to meet you again. What are you doing in my laboratory work?
        You are very nice person, do you know it? To be honest, I can't stand programming.
        But it doesn't depend on you! It's my personal problem and I don't know how to
        solve it... It doesn't matter right now""")

    word_storage = WordStorage()
    word_storage.update(text)

    encoded_text = encode_text(word_storage, text)

    n_gram_trie = NGramTrie(3, encoded_text)

    generator_of_text = NGramTextGenerator(word_storage, n_gram_trie)
    context = word_storage.get_id('on'), word_storage.get_id('you')

    formed_ids = generator_of_text.generate_text(context, 1)
    formed_text = []

    for ids in formed_ids:
        word = word_storage.get_word(ids)
        if word != '<END>':
            formed_text.append(word)

    RESULT = ' '.join(formed_text)
    print(RESULT)
    assert RESULT == 'on you', 'Not working'
Example no. 10
from lab_4.main import tokenize_by_sentence, WordStorage
from lab_4.main import encode_text
from lab_4.main import NGramTextGenerator
from lab_4.ngrams.ngram_trie import NGramTrie

if __name__ == '__main__':
    text = 'I have a cat. His name is Bruno. I have a dog too. ' \
           'His name is Rex. Her name is Rex too'
    corpus = tokenize_by_sentence(text)

    word_storage = WordStorage()
    word_storage.update(corpus)

    encoded_text = encode_text(word_storage, corpus)

    n_gram_trie = NGramTrie(2, encoded_text)

    n_gram_text_generator = NGramTextGenerator(word_storage, n_gram_trie)

    context = (word_storage.get_id('i'), word_storage.get_id('have'))

    text_generated = n_gram_text_generator.generate_text(context, 2)
    output_text = []

    for word_id in text_generated:
        word = word_storage.get_word(word_id)
        if word != '<END>':
            output_text.append(word)

    RESULT = ' '.join(output_text)
    print(RESULT)
    assert RESULT == 'i have a cat name is rex', 'Something went wrong :('
Example no. 11
from lab_4.main import WordStorage, tokenize_by_sentence, encode_text, NGramTextGenerator
from lab_4.ngrams.ngram_trie import NGramTrie

if __name__ == '__main__':
    text = 'I have a cat.\nHer name is Mila'
    tok_text = tokenize_by_sentence(text)
    storage = WordStorage()
    storage.update(tok_text)
    print(storage)
    encoded_text = encode_text(storage, tok_text)
    print(encoded_text)
    n_gram_trie = NGramTrie(2, encoded_text)
    print(n_gram_trie)
    gen = NGramTextGenerator(storage, n_gram_trie)
    context = (storage.get_id('a'), )
    print(context)
    RESULT = gen.generate_text(context, 2)
    print(RESULT)
    assert RESULT, 'Not working'
Example no. 12
from lab_4.ngrams.ngram_trie import NGramTrie
from lab_4.main import tokenize_by_sentence
from lab_4.main import WordStorage
from lab_4.main import encode_text
from lab_4.main import NGramTextGenerator

if __name__ == '__main__':
    text = "This is a dog. It likes running. This is a cat. It likes sleeping. Everyone likes sleeping too."
    text_in_tokens = tokenize_by_sentence(text)

    word_storage = WordStorage()
    word_storage.update(text_in_tokens)

    encoded_text = encode_text(word_storage, text_in_tokens)

    n_gram_trie = NGramTrie(2, encoded_text)
    context = (word_storage.get_id('likes'),)
    text_generator = NGramTextGenerator(word_storage, n_gram_trie)

    RESULT = text_generator.generate_text(context, 4)

    print(RESULT)

    assert RESULT, "Someting went worng.."