Exemple #1
0
    def test_ngram_text_generator_instance_creation(self):
        """
        A freshly built generator must expose the storage and the trie
        it was constructed with
        """
        storage = WordStorage()
        empty_trie = NGramTrie(2, ())

        text_generator = NGramTextGenerator(storage, empty_trie)

        self.assertEqual(text_generator._word_storage, storage)
        self.assertEqual(text_generator._n_gram_trie, empty_trie)
    def test_ngram_text_generator_generate_sentence_no_end(self):
        """
        The corpus holds a single trailing '<END>' only; the generated
        sentence is still expected to finish with '<END>'
        """
        corpus = ('i', 'have', 'a', 'cat',
                  'his', 'name', 'is', 'bruno',
                  'i', 'have', 'a', 'dog', 'too',
                  'his', 'name', 'is', 'rex',
                  'her', 'name', 'is', 'rex', 'too', '<END>')
        storage = WordStorage()
        storage.update(corpus)
        bigram_trie = NGramTrie(2, encode_text(storage, corpus))
        start_context = (storage.get_id('cat'), )

        text_generator = NGramTextGenerator(storage, bigram_trie)
        sentence = text_generator._generate_sentence(start_context)

        # decode only the last generated id and compare it with the end marker
        last_word = storage.get_word(sentence[-1])
        self.assertEqual('<END>', last_word)
Exemple #3
0
    def test_get_most_frequent_gram_no_such_context(self):
        """
        A context that never occurs in the corpus must produce an empty tuple
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>',
                  'his', 'name', 'is', 'bruno', '<END>')
        storage = WordStorage()
        storage.update(corpus)
        trigram_trie = NGramTrie(3, encode_text(storage, corpus))
        text_generator = NGramTextGenerator(storage, trigram_trie)

        # the words 'i' and 'name' are never adjacent in the corpus
        unseen_context = (storage.get_id('i'), storage.get_id('name'))

        most_frequent = text_generator.get_most_frequent_gram(unseen_context)
        self.assertEqual((), most_frequent)
Exemple #4
0
    def test_text_generator_generate_sentence_proper_number_of_end(self):
        """
        A generated sentence must contain exactly one '<END>' id,
        even though the starting context itself includes '<END>'
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>',
                  'his', 'name', 'is', 'bruno', '<END>',
                  'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>',
                  'there', 'are', 'a', 'cat', 'outside', '<END>',
                  'here', 'is', 'a', 'cat', 'outside', '<END>')

        word_storage = WordStorage()
        word_storage.update(corpus)
        trigram_trie = NGramTrie(3, encode_text(word_storage, corpus))
        start_context = (
            word_storage.get_id('a'),
            word_storage.get_id('is'),
            word_storage.get_id('<END>'),
        )

        text_generator = NGramTextGenerator(word_storage, trigram_trie)
        sentence = text_generator._generate_sentence(start_context)

        end_id = word_storage.get_id('<END>')
        self.assertEqual(1, sentence.count(end_id))
    def test_ngram_text_generator_generate_next_word_no_such_context(self):
        """
        For a context missing from the n-grams the next word is expected
        to be the id of '<END>'
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>',
                  'his', 'name', 'is', 'bruno', '<END>')
        storage = WordStorage()
        storage.update(corpus)
        trigram_trie = NGramTrie(3, encode_text(storage, corpus))
        text_generator = NGramTextGenerator(storage, trigram_trie)

        # 'i name' is not present among the n-grams, so the most frequent
        # option is expected instead
        unseen_context = (storage.get_id('i'), storage.get_id('name'))
        most_frequent_id = storage.get_id('<END>')  # '<END>' occurs twice

        next_word = text_generator._generate_next_word(unseen_context)
        self.assertEqual(most_frequent_id, next_word)
    def test_ngram_text_generator_generate_sentence_ideal(self):
        """
        Starting from 'i', the second id of the sentence must be 'have'
        and the last one must be '<END>'
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>',
                  'his', 'name', 'is', 'bruno', '<END>',
                  'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>',
                  'her', 'name', 'is', 'rex', 'too', '<END>')
        word_storage = WordStorage()
        word_storage.update(corpus)
        bigram_trie = NGramTrie(2, encode_text(word_storage, corpus))
        start_context = (word_storage.get_id('i'), )

        expected_second = word_storage.get_id('have')
        expected_last = word_storage.get_id('<END>')

        text_generator = NGramTextGenerator(word_storage, bigram_trie)
        sentence = text_generator._generate_sentence(start_context)

        # index 0 holds the context word, index 1 the first generated one
        self.assertEqual(sentence[1], expected_second)
        self.assertEqual(sentence[-1], expected_last)
    def test_generate_text_ideal(self):
        """
        Requesting 3 from generate_text on a small corpus must give a
        result containing exactly three '<END>' ids
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>',
                  'his', 'name', 'is', 'bruno', '<END>',
                  'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>',
                  'her', 'name', 'is', 'rex', 'too', '<END>')

        word_storage = WordStorage()
        word_storage.update(corpus)
        bigram_trie = NGramTrie(2, encode_text(word_storage, corpus))
        text_generator = NGramTextGenerator(word_storage, bigram_trie)

        start_context = (word_storage.get_id('bruno'), )
        end_id = word_storage.get_id('<END>')

        generated = text_generator.generate_text(start_context, 3)
        self.assertEqual(generated.count(end_id), 3)
Exemple #8
0
    def test_ngram_text_generator_end_at_the_beginning(self):
        """
        When the context is '<END>' itself, the generated sentence must
        hold '<END>' exactly once and only as its last element
        """
        corpus = ('i', 'like', 'to', 'read', '<END>',
                  'he', 'likes', 'to', 'read', 'too',
                  'i', 'like', 'a', 'book', 'called', '"Harry Potter"', '<END>',
                  'he', 'likes', 'another', 'book', '<END>',
                  'he', 'does', 'not', 'tell', 'me', 'name', '<END>')

        word_storage = WordStorage()
        word_storage.update(corpus)
        bigram_trie = NGramTrie(2, encode_text(word_storage, corpus))
        start_context = (word_storage.get_id('<END>'), )

        expected_last = word_storage.get_id('<END>')

        text_generator = NGramTextGenerator(word_storage, bigram_trie)
        sentence = text_generator._generate_sentence(start_context)

        self.assertEqual(expected_last, sentence[-1])
        end_id = word_storage.get_id('<END>')
        self.assertEqual(1, sentence.count(end_id))
    def test_save_model_incorrect_path(self):
        """
        save_model must raise FileNotFoundError for the path
        'some_folder/some_file'
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>',
                  'his', 'name', 'is', 'bruno', '<END>')

        word_storage = WordStorage()
        word_storage.update(corpus)
        bigram_trie = NGramTrie(2, encode_text(word_storage, corpus))
        text_generator = NGramTextGenerator(word_storage, bigram_trie)

        with self.assertRaises(FileNotFoundError):
            save_model(text_generator, r'some_folder/some_file')
    def test_context_end(self):
        """
        A context that contains '<END>' must make _generate_sentence
        raise ValueError
        """
        # the context is passed as raw words here, not as encoded ids
        context = ('cat', '<END>')
        corpus = ('i', 'have', 'a', 'cat', '<END>',
                  'his', 'name', 'is', 'bruno', '<END>')
        storage = WordStorage()
        storage.update(corpus)
        trigram_trie = NGramTrie(3, encode_text(storage, corpus))
        text_generator = NGramTextGenerator(storage, trigram_trie)

        with self.assertRaises(ValueError):
            text_generator._generate_sentence(context)
Exemple #11
0
    def test_text_generator_generate_sentence_includes_context(self):
        """
        A generated sentence must begin with the given context
        (the context used here does not contain <END>)
        """
        corpus = ('i', 'have', 'a', 'cat', 'and', 'a', 'dog', '<END>',
                  'his', 'name', 'is', 'bruno', '<END>',
                  'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>',
                  'there', 'are', 'a', 'cat', 'and', 'a', 'bear', 'outside',
                  '<END>',
                  'here', 'is', 'a', 'cat', 'outside', '<END>')

        word_storage = WordStorage()
        word_storage.update(corpus)
        trigram_trie = NGramTrie(3, encode_text(word_storage, corpus))
        text_generator = NGramTextGenerator(word_storage, trigram_trie)

        start_context = (word_storage.get_id('a'), word_storage.get_id('cat'))
        sentence = text_generator._generate_sentence(start_context)

        # compare the leading slice of the sentence with the context itself
        sentence_head = sentence[:len(start_context)]
        self.assertEqual(start_context, sentence_head)
    def test_text_generator_throws_errors(self):
        """
        generate_text must raise ValueError for every malformed context
        """
        bad_inputs = [[], {}, None, 9, 9.34, True]
        corpus = ('i', 'have', 'a', 'cat', '<END>',
                  'his', 'name', 'is', 'bruno', '<END>')
        storage = WordStorage()
        storage.update(corpus)
        bigram_trie = NGramTrie(2, encode_text(storage, corpus))
        text_generator = NGramTextGenerator(storage, bigram_trie)

        for invalid_context in bad_inputs:
            with self.assertRaises(ValueError):
                text_generator.generate_text(invalid_context, 10)
Exemple #13
0
    def test_ngram_text_generator_bad_num_input(self):  # new test
        """
        Zero and negative integers given to _generate_sentence must
        raise ValueError
        """
        bad_inputs = (-5, 0, -2, -1, -8)
        corpus = ('i', 'have', 'a', 'cat', '<END>',
                  'his', 'name', 'is', 'bruno', '<END>')
        storage = WordStorage()
        storage.update(corpus)
        bigram_trie = NGramTrie(2, encode_text(storage, corpus))
        text_generator = NGramTextGenerator(storage, bigram_trie)

        for invalid_value in bad_inputs:
            with self.assertRaises(ValueError):
                text_generator._generate_sentence(invalid_value)
    def test_ngram_text_generator_generate_next_word_incorrect_context(self):
        """
        _generate_next_word must raise ValueError for every invalid context
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>',
                  'his', 'name', 'is', 'bruno', '<END>')
        storage = WordStorage()
        storage.update(corpus)
        trigram_trie = NGramTrie(3, encode_text(storage, corpus))
        text_generator = NGramTextGenerator(storage, trigram_trie)

        # (3, ) is included as a context of incorrect size for this trie
        bad_inputs = [[], {}, (3, ), None, 9, 9.34, True]
        for invalid_context in bad_inputs:
            with self.assertRaises(ValueError):
                text_generator._generate_next_word(invalid_context)
    def test_load_model_has_generator_methods(self):
        """
        A model written by save_model and read back by load_model must
        expose the same attribute names as the original generator
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>',
                  'his', 'name', 'is', 'bruno', '<END>',
                  'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>',
                  'her', 'name', 'is', 'rex', 'too', '<END>')

        storage = WordStorage()
        storage.update(corpus)

        encoded = encode_text(storage, corpus)

        trie = NGramTrie(2, encoded)

        generator = NGramTextGenerator(storage, trie)

        save_model(generator, 'my_awesome_model')
        loaded_model = load_model('my_awesome_model')

        # assertEquals is a deprecated alias that was removed in Python 3.12;
        # assertEqual is the supported spelling
        self.assertEqual(dir(loaded_model), dir(generator))
    def test_save_model_ideal(self):
        """
        After save_model the file 'my_awesome_model' must be readable
        as UTF-8 text and must not be empty
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>',
                  'his', 'name', 'is', 'bruno', '<END>',
                  'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>',
                  'her', 'name', 'is', 'rex', 'too', '<END>')

        word_storage = WordStorage()
        word_storage.update(corpus)
        bigram_trie = NGramTrie(2, encode_text(word_storage, corpus))
        text_generator = NGramTextGenerator(word_storage, bigram_trie)

        save_model(text_generator, 'my_awesome_model')

        with open('my_awesome_model', 'r', encoding='utf-8') as model_file:
            saved_content = model_file.read()
        self.assertTrue(saved_content)
Exemple #17
0
    def test_save_model_incorrect(self):
        """
        save_model must raise ValueError when the path argument is invalid
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>',
                  'his', 'name', 'is', 'bruno', '<END>',
                  'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>',
                  'her', 'name', 'is', 'rex', 'too', '<END>')

        word_storage = WordStorage()
        word_storage.update(corpus)
        bigram_trie = NGramTrie(2, encode_text(word_storage, corpus))
        text_generator = NGramTextGenerator(word_storage, bigram_trie)

        # note: the last entry is the WordStorage class itself, not an instance
        bad_inputs = ((), [], {}, 123, None, WordStorage)

        for invalid_path in bad_inputs:
            with self.assertRaises(ValueError):
                save_model(text_generator, invalid_path)
Exemple #18
0
"""
Text generator implementation starter
"""

from lab_4.main import WordStorage, NGramTrie, NGramTextGenerator
from lab_4.main import tokenize_by_sentence, encode_text

if __name__ == '__main__':
    # Read the source text inside a context manager so the file handle is
    # closed as soon as its content has been loaded (the bare open() call
    # used before never closed it).
    with open('lab_3/Frank_Baum.txt', encoding="utf-8") as first_text:
        first_text_tokenized = tokenize_by_sentence(first_text.read())

    word_storage = WordStorage()
    word_storage.update(first_text_tokenized)

    encoded = encode_text(word_storage, first_text_tokenized)

    n_gram_trie = NGramTrie(2, encoded)
    generator = NGramTextGenerator(word_storage, n_gram_trie)

    # A one-element slice of the encoded text serves as the starting context.
    RESULT = generator.generate_text(encoded[16:17], 3)

    assert RESULT, "Not working"
Exemple #19
0
"""
Lab 4 implementation starter
"""

from lab_4.main import WordStorage, NGramTrie, NGramTextGenerator
from lab_4.main import tokenize_by_sentence, encode_text

if __name__ == '__main__':
    # Demo run: tokenize a short text, encode it, and generate text from it
    # with a trie of size 2, starting from the word 'has'.
    text = ('She has a house. He has a house too. Besides he has a car. My friend also he has a car. '
            'Seems like everyone has has a car, but me.')

    text_tokenized = tokenize_by_sentence(text)

    word_storage = WordStorage()
    word_storage.update(text_tokenized)

    encoded_text = encode_text(word_storage, text_tokenized)
    print(encoded_text)

    trie = NGramTrie(2, encoded_text)
    context = (word_storage.get_id('has'),)
    generator = NGramTextGenerator(word_storage, trie)

    RESULT = generator.generate_text(context, 4)
    print(RESULT)
    # DO NOT REMOVE NEXT LINE - KEEP IT INTENTIONALLY LAST
    assert RESULT, "Something went wrong"
Exemple #20
0
from lab_4.main import encode_text
from lab_4.main import NGramTextGenerator

if __name__ == '__main__':
    # NOTE(review): tokenize_by_sentence, WordStorage and NGramTrie are used
    # below but are not among the import lines visible above - confirm they
    # are imported elsewhere in the file.
    text = ('I have a cat. His name is Bruno. I have a dog too. '
            'His name is Rex. Her name is Rex too')
    corpus = tokenize_by_sentence(text)

    word_storage = WordStorage()
    word_storage.update(corpus)

    encoded_text = encode_text(word_storage, corpus)
    n_gram_trie = NGramTrie(2, encoded_text)
    n_gram_text_generator = NGramTextGenerator(word_storage, n_gram_trie)

    context = (word_storage.get_id('i'), word_storage.get_id('have'))
    text_generated = n_gram_text_generator.generate_text(context, 2)

    # Decode every generated id and keep only the words that are not '<END>'.
    decoded_words = [word_storage.get_word(word_id) for word_id in text_generated]
    output_text = [word for word in decoded_words if word != '<END>']

    RESULT = ' '.join(output_text)
    print(RESULT)
    assert RESULT == 'i have a cat name is rex', 'Something went wrong :('
"""
Text generator implementation starter
"""

from lab_4.main import WordStorage, NGramTrie, NGramTextGenerator
from lab_4.main import tokenize_by_sentence, encode_text

if __name__ == '__main__':
    # Demo run: generate text from a tiny corpus with a trie of size 2,
    # starting from the word 'i'.
    text = 'I have a cat. His name is Bruno. I have a dog. Her name is Rex. Her name is Rex too.'
    text_tokenized = tokenize_by_sentence(text)

    word_storage = WordStorage()
    word_storage.update(text_tokenized)

    encoded = encode_text(word_storage, text_tokenized)
    trie = NGramTrie(2, encoded)

    context = (word_storage.get_id('i'), )
    generator = NGramTextGenerator(word_storage, trie)
    RESULT = generator.generate_text(context, 4)

    print(RESULT)

    assert RESULT, "Not working"
Exemple #22
0
    On Monday, I go to work. 
    I work at the post office. 
    Everyone shops for food at the grocery store. 
    They also eat at the restaurant. 
    The restaurant serves pizza and ice cream.
    My friends and I go to the park. 
    We like to play soccer at the park. 
    On Fridays, we go to the cinema to see a movie. 
    Children don't go to school on the weekend. 
    Each day, people go to the hospital when they are sick. 
    The doctors and nurses take care of them in the city. 
    The police keep everyone safe. I am happy to live in my city.
    """

    # Tokenize the sample text held in TEXT.
    corpus = tokenize_by_sentence(TEXT)

    # Feed the tokenized corpus into a fresh storage.
    word_storage = WordStorage()
    word_storage.update(corpus)

    encoded_corpus = encode_text(word_storage, corpus)

    # The trie is created with size 3 over the encoded corpus.
    ngrams = NGramTrie(3, encoded_corpus)

    generator = NGramTextGenerator(word_storage, ngrams)

    # Two-id context looked up for the words 'the' and 'post'.
    context = (word_storage.get_id('the'), word_storage.get_id('post'))

    # NOTE(review): the second argument is presumably the number of
    # sentences to generate - confirm against generate_text.
    RESULT = generator.generate_text(context, 1)
    # Map every generated id back to its word and print them space-separated.
    print(' '.join([word_storage.get_word(word) for word in RESULT]))
    assert RESULT, 'Language generator work incorrect'
Exemple #23
0
if __name__ == '__main__':
    # NOTE(review): no import lines are visible for this script - confirm
    # that the names used below are imported elsewhere in the file.
    raw_text = """Hi everyone! Nice to meet you again. What are you doing in my laboratory work?
                                    You are very nice person, do you know it? To be honest, I can't stand programming.
                                    But it doesn't depend on you! It's my personal problem and I don't know how to
                                    solve it... It doesn't matter right now"""
    text = tokenize_by_sentence(raw_text)

    word_storage = WordStorage()
    word_storage.update(text)

    encoded_text = encode_text(word_storage, text)
    n_gram_trie = NGramTrie(3, encoded_text)
    generator_of_text = NGramTextGenerator(word_storage, n_gram_trie)

    context = (word_storage.get_id('on'), word_storage.get_id('you'))
    formed_ids = generator_of_text.generate_text(context, 1)

    # Decode the generated ids and drop every '<END>' marker.
    decoded_words = [word_storage.get_word(ids) for ids in formed_ids]
    formed_text = [word for word in decoded_words if word != '<END>']

    RESULT = ' '.join(formed_text)
    print(RESULT)
    assert RESULT == 'on you', ''
Exemple #24
0
from lab_4.main import tokenize_by_sentence, WordStorage, encode_text, NGramTextGenerator
from lab_4.ngrams.ngram_trie import NGramTrie

if __name__ == '__main__':
    # Demo run: generate text with a trie of size 2 and check that the
    # result holds exactly two '<END>' ids.
    TEXT = 'I like flowers.\nMy mom likes flowers too.\nHer favourite flower is rose.\nMy favourite flower is rose too.'
    corpus = tokenize_by_sentence(TEXT)

    word_storage = WordStorage()
    word_storage.update(corpus)

    encoded_corpus = encode_text(word_storage, corpus)
    ngrams = NGramTrie(2, encoded_corpus)
    text_generator = NGramTextGenerator(word_storage, ngrams)

    # the context is the hard-coded id 1 rather than a looked-up word
    gen_text = text_generator.generate_text((1, ), 2)

    actual = gen_text.count(word_storage.get_id('<END>'))
    RESULT = 2
    print(actual)
    assert RESULT == actual, 'not working'
Exemple #25
0
from lab_4.main import decode_text
from lab_4.main import NGramTextGenerator
from lab_4.main import LikelihoodBasedTextGenerator
from lab_4.main import BackOffGenerator

if __name__ == '__main__':
    corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is', 'bruno',
              '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>', 'his', 'name',
              'is', 'rex', '<END>', 'her', 'name', 'is', 'rex', 'too', '<END>')
    storage = WordStorage()
    storage.update(corpus)
    encoded = encode_text(storage, corpus)
    trie = ngrams.NGramTrie(2, encoded)
    context = (storage.get_id('i'), )

    generator = NGramTextGenerator(storage, trie)
    actual = generator.generate_text(context, 5)
    actual = decode_text(storage, actual)
    print(actual)

    generator = LikelihoodBasedTextGenerator(storage, trie)
    actual = generator.generate_text(context, 5)
    actual = decode_text(storage, actual)
    print(actual)

    two = ngrams.NGramTrie(2, encoded)
    trie = ngrams.NGramTrie(3, encoded)

    context = (
        storage.get_id('name'),
        storage.get_id('is'),
"""
Lab 4 implementation starter
"""

from lab_4.main import tokenize_by_sentence, WordStorage, encode_text, NGramTextGenerator
from lab_4.ngrams.ngram_trie import NGramTrie

if __name__ == '__main__':
    # Demo run that prints every intermediate object before generating
    # text from the word 'a' with a trie of size 2.
    text = 'I have a dog.\nHis name is Will'

    tokenize_text = tokenize_by_sentence(text)
    print(tokenize_text)

    storage = WordStorage()
    storage.update(tokenize_text)
    print(storage)

    encode = encode_text(storage, tokenize_text)
    print(encode)

    n_gram_trie = NGramTrie(2, encode)
    print(n_gram_trie)

    generator = NGramTextGenerator(storage, n_gram_trie)
    context = (storage.get_id('a'), )
    print(context)

    RESULT = generator.generate_text(context, 3)
    print(RESULT)
    # DO NOT REMOVE NEXT LINE - KEEP IT INTENTIONALLY LAST
    assert RESULT, 'Not working'
Exemple #27
0
from lab_4.main import tokenize_by_sentence
from lab_4.main import WordStorage, LikelihoodBasedTextGenerator
from lab_4.main import encode_text, decode_text
from lab_4.main import NGramTextGenerator

if __name__ == '__main__':
    # NOTE(review): NGramTrie is used below but is not among the import
    # lines visible above - confirm it is imported elsewhere in the file.
    TEXT = 'I have a cat. His name is Bruno. I have a dog too. His name is Rex. Her name is Rex too.'
    tokenized_text = tokenize_by_sentence(TEXT)

    word_storage = WordStorage()
    word_storage.update(tokenized_text)

    encoded = encode_text(word_storage, tokenized_text)
    trie = NGramTrie(3, encoded)

    context = (word_storage.get_id('name'), word_storage.get_id('is'))

    # Plain n-gram generation; its result is not used in the final check.
    generator = NGramTextGenerator(word_storage, trie)
    generated_text = generator.generate_text(context, 2)

    # Likelihood-based generation; only this result is decoded and asserted.
    gen_likelihood = LikelihoodBasedTextGenerator(word_storage, trie)
    gen_text = gen_likelihood.generate_text(context, 2)
    decoded_text = decode_text(word_storage, gen_text)

    RESULT = decoded_text

    assert RESULT == ('Name is rex', 'Her name is rex'), "Not working"
Exemple #28
0
from lab_4.ngrams.ngram_trie import NGramTrie
from lab_4.main import tokenize_by_sentence
from lab_4.main import WordStorage
from lab_4.main import encode_text
from lab_4.main import NGramTextGenerator

if __name__ == '__main__':
    # Demo run: generate text with a trie of size 2, starting from the
    # word 'likes'.
    text = "This is a dog. It likes running. This is a cat. It likes sleeping. Everyone likes sleeping too."
    text_in_tokens = tokenize_by_sentence(text)

    word_storage = WordStorage()
    word_storage.update(text_in_tokens)

    encoded_text = encode_text(word_storage, text_in_tokens)
    n_gram_trie = NGramTrie(2, encoded_text)

    context = (word_storage.get_id('likes'),)
    text_generator = NGramTextGenerator(word_storage, n_gram_trie)
    RESULT = text_generator.generate_text(context, 4)

    print(RESULT)

    assert RESULT, "Someting went worng.."