Exemple #1
0
def load_model(path_to_saved_model: str) -> NGramTextGenerator:
    """
    Rebuild an NGramTextGenerator from a JSON file.

    The JSON object is read for the keys 'word_storage',
    'n_gram_trie_size', 'n_grams', 'n_gram_trie_frequencies' and
    'uni_grams'.

    :param path_to_saved_model: path to the JSON file to read
    :return: generator built from the restored word storage and trie
    :raises ValueError: if path_to_saved_model is not a string
    :raises FileNotFoundError: if the file cannot be found
    """
    if not isinstance(path_to_saved_model, str):
        raise ValueError

    # Let open() raise its own FileNotFoundError: re-raising a bare one
    # would drop the errno and the offending filename from the message.
    with open(path_to_saved_model, 'r') as file:
        model = json.load(file)

    word_storage = WordStorage()
    word_storage.storage = model['word_storage']

    n_gram_size = int(model['n_gram_trie_size'])
    # Placeholder text of n_gram_size tokens; presumably only there to get
    # past constructor validation, since the n-gram data is overwritten
    # right below -- TODO confirm against NGramTrie.
    trie = NGramTrie(n_gram_size=n_gram_size,
                     encoded_text=('he', ) * n_gram_size)
    trie.n_grams = tuple(tuple(n_gram) for n_gram in model['n_grams'])
    # Frequency keys are parsed from 'id, id, ...' strings back into
    # tuples of ints.
    trie.n_gram_frequencies = {
        tuple(map(int, key.split(', '))): value
        for key, value in model['n_gram_trie_frequencies'].items()
    }
    trie.uni_grams = {(int(key), ): value
                      for key, value in model['uni_grams'].items()}

    return NGramTextGenerator(word_storage, trie)
Exemple #2
0
    def test_generate_text_large_context(self):
        """
        A five-gram generator asked for three sentences must produce
        exactly three '<END>' ids.
        """
        words = (
            'i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
            'i', 'have', 'a', 'dog', 'too', '<END>',
            'his', 'name', 'is', 'rex', '<END>',
            'her', 'name', 'is', 'rex', 'too', '<END>',
        )
        word_storage = WordStorage()
        word_storage.update(words)
        five_grams = NGramTrie(5, encode_text(word_storage, words))
        text_generator = NGramTextGenerator(word_storage, five_grams)

        seed = tuple(word_storage.get_id(token)
                     for token in ('i', 'have', 'a', 'bruno'))
        end_id = word_storage.get_id('<END>')

        generated = text_generator.generate_text(seed, 3)
        self.assertEqual(generated.count(end_id), 3)
Exemple #3
0
    def test_decode_text_ideal(self):
        """
        Two sentences generated from the context ('name', 'is') end with
        the '<END>' id and decode to the expected pair of strings.
        """
        words = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                 'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
                 'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is',
                 'rex', 'too', '<END>')
        word_storage = WordStorage()
        word_storage.update(words)
        tri_grams = NGramTrie(3, encode_text(word_storage, words))

        seed = tuple(word_storage.get_id(token) for token in ('name', 'is'))
        end_id = word_storage.get_id('<END>')

        text_generator = LikelihoodBasedTextGenerator(word_storage, tri_grams)
        encoded_sentences = text_generator.generate_text(seed, 2)
        self.assertEqual(encoded_sentences[-1], end_id)

        decoded = decode_text(word_storage, encoded_sentences)
        self.assertEqual(('Name is rex', 'Her name is rex'), decoded)
Exemple #4
0
    def test_decode_text_incorrect_storage(self):
        """
        decode_text must raise ValueError for every invalid storage
        argument, even when the encoded text itself was generated normally.
        """
        words = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                 'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
                 'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is',
                 'rex', 'too', '<END>')
        word_storage = WordStorage()
        word_storage.update(words)
        tri_grams = NGramTrie(3, encode_text(word_storage, words))

        seed = tuple(word_storage.get_id(token) for token in ('name', 'is'))
        text_generator = LikelihoodBasedTextGenerator(word_storage, tri_grams)
        encoded_sentences = text_generator.generate_text(seed, 2)

        for invalid_storage in ((), [], 123, None, NGramTrie):
            with self.assertRaises(ValueError):
                decode_text(invalid_storage, encoded_sentences)
Exemple #5
0
    def test_decode_text_ideal_conditions(self):
        """
        Every decoded sentence must contain no '<END>' marker, start with
        an uppercase character and end with an alphabetic character.
        """
        words = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                 'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
                 'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is',
                 'rex', 'too', '<END>')
        word_storage = WordStorage()
        word_storage.update(words)
        tri_grams = NGramTrie(3, encode_text(word_storage, words))

        seed = tuple(word_storage.get_id(token) for token in ('name', 'is'))
        text_generator = LikelihoodBasedTextGenerator(word_storage, tri_grams)
        encoded_sentences = text_generator.generate_text(seed, 2)

        for decoded in decode_text(word_storage, encoded_sentences):
            self.assertNotIn('<END>', decoded)
            first_char, last_char = decoded[0], decoded[-1]
            self.assertTrue(first_char.isupper())
            self.assertTrue(last_char.isalpha())
    def test_likelihood_generator_instance_creation(self):
        """
        The constructor must keep the storage and the trie it was given in
        the _word_storage and _n_gram_trie attributes.
        """
        storage = WordStorage()
        bigram_trie = NGramTrie(2, ())
        instance = LikelihoodBasedTextGenerator(storage, bigram_trie)

        stored_pairs = (
            ('_word_storage', storage),
            ('_n_gram_trie', bigram_trie),
        )
        for attribute, given in stored_pairs:
            self.assertEqual(getattr(instance, attribute), given)
    def test_ngram_text_generator_generate_next_word(self):
        """
        For the context ('i', 'have') the trigram generator must return
        the id of 'a'.
        """
        words = (
            'i', 'have', 'a', 'cat', '<END>',
            'i', 'have', 'a', 'bruno', 'cat', '<END>',
        )
        storage = WordStorage()
        storage.update(words)
        tri_grams = NGramTrie(3, encode_text(storage, words))
        text_generator = NGramTextGenerator(storage, tri_grams)

        seed = tuple(storage.get_id(token) for token in ('i', 'have'))
        wanted = storage.get_id('a')

        self.assertEqual(wanted, text_generator._generate_next_word(seed))
    def test_text_generator_throws_errors(self):
        """
        generate_text must raise ValueError for each invalid context.
        """
        invalid_contexts = [[], {}, None, 9, 9.34, True]
        words = (
            'i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
        )
        storage = WordStorage()
        storage.update(words)
        bi_grams = NGramTrie(2, encode_text(storage, words))
        text_generator = NGramTextGenerator(storage, bi_grams)

        for invalid_context in invalid_contexts:
            with self.assertRaises(ValueError):
                text_generator.generate_text(invalid_context, 10)
    def test_ngram_text_generator_generate_next_word_incorrect_context(self):
        """
        _generate_next_word must raise ValueError for each invalid context.
        """
        words = (
            'i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
        )
        storage = WordStorage()
        storage.update(words)
        tri_grams = NGramTrie(3, encode_text(storage, words))
        text_generator = NGramTextGenerator(storage, tri_grams)

        # (3, ) is a context of the wrong size for this trie
        invalid_contexts = [[], {}, (3, ), None, 9, 9.34, True]
        for invalid_context in invalid_contexts:
            with self.assertRaises(ValueError):
                text_generator._generate_next_word(invalid_context)
    def test_ngram_text_generator_generate_sentence_properly(self):
        """
        A sentence generated from the context ('i', ) must finish with the
        '<END>' id.
        """
        words = (
            'i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
        )
        storage = WordStorage()
        storage.update(words)
        bi_grams = NGramTrie(2, encode_text(storage, words))

        seed = (storage.get_id('i'), )
        end_id = storage.get_id('<END>')

        text_generator = NGramTextGenerator(storage, bi_grams)
        sentence = text_generator._generate_sentence(seed)
        self.assertEqual(sentence[-1], end_id)
    def test_length_of_sentence(self):
        """
        A generated sentence must not be longer than the context plus 21
        ids.
        """
        words = (
            'i', 'have', 'a', 'cat',
            'his', 'name', 'is', 'bruno',
            'i', 'have', 'a', 'dog', 'too',
            'his', 'name', 'is', 'rex',
            'her', 'name', 'is', 'rex', 'too',
            'he', 'funny', '<END>',
        )
        storage = WordStorage()
        storage.update(words)
        bi_grams = NGramTrie(2, encode_text(storage, words))
        seed = (storage.get_id('cat'), )

        text_generator = NGramTextGenerator(storage, bi_grams)
        sentence_length = len(text_generator._generate_sentence(seed))
        # no more than 20 generated words plus the end marker
        upper_bound = len(seed) + 21
        self.assertLessEqual(sentence_length, upper_bound)
    def test_ngram_text_generator_generate_sentence_no_end(self):
        """
        The last id of the generated sentence must map back to '<END>'
        even though the corpus holds a single end marker.
        """
        words = (
            'i', 'have', 'a', 'cat',
            'his', 'name', 'is', 'bruno',
            'i', 'have', 'a', 'dog', 'too',
            'his', 'name', 'is', 'rex',
            'her', 'name', 'is', 'rex', 'too', '<END>',
        )
        storage = WordStorage()
        storage.update(words)
        bi_grams = NGramTrie(2, encode_text(storage, words))
        seed = (storage.get_id('cat'), )

        text_generator = NGramTextGenerator(storage, bi_grams)
        sentence = text_generator._generate_sentence(seed)

        final_word = storage.get_word(sentence[-1])
        self.assertEqual('<END>', final_word)
Exemple #13
0
    def test_generate_next_word_incorrect_context(self):
        """
        The likelihood generator's _generate_next_word must raise
        ValueError for each invalid context.
        """
        words = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                 'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
                 'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is',
                 'rex', 'too', '<END>')
        word_storage = WordStorage()
        word_storage.update(words)
        tri_grams = NGramTrie(3, encode_text(word_storage, words))

        invalid_contexts = [[], {}, (2000, 1000, ), None, 9, 9.34, True]
        text_generator = LikelihoodBasedTextGenerator(word_storage, tri_grams)

        for invalid_context in invalid_contexts:
            with self.assertRaises(ValueError):
                text_generator._generate_next_word(invalid_context)
    def test_generate_next_word_larger_context(self):
        """
        With a four-gram trie the context ('his', 'name', 'is') must yield
        the id of 'bruno'.
        """
        words = (
            'i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
            'i', 'have', 'a', 'dog', 'too', '<END>',
            'his', 'name', 'is', 'rex', '<END>',
            'her', 'name', 'is', 'rex', 'too', '<END>',
        )
        word_storage = WordStorage()
        word_storage.update(words)
        four_grams = NGramTrie(4, encode_text(word_storage, words))

        wanted = word_storage.get_id('bruno')
        seed = tuple(word_storage.get_id(token)
                     for token in ('his', 'name', 'is'))

        text_generator = LikelihoodBasedTextGenerator(word_storage, four_grams)
        self.assertEqual(wanted, text_generator._generate_next_word(seed))
Exemple #15
0
    def test_calculate_likelihood_incorrect_context(self):
        """
        _calculate_maximum_likelihood must raise ValueError for each
        invalid context, given a valid word id.
        """
        words = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                 'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
                 'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is',
                 'rex', 'too', '<END>')
        word_storage = WordStorage()
        word_storage.update(words)
        bi_grams = NGramTrie(2, encode_text(word_storage, words))

        # (2000, 1000, ) is a context sized for a three-gram
        invalid_contexts = [[], {}, (2000, 1000, ), None, 9, 9.34, True]
        word_id = word_storage.get_id('dog')

        text_generator = LikelihoodBasedTextGenerator(word_storage, bi_grams)

        for invalid_context in invalid_contexts:
            with self.assertRaises(ValueError):
                text_generator._calculate_maximum_likelihood(
                    word_id, invalid_context)
    def test_ngram_text_generator_generate_next_word_no_such_context(self):
        """
        For a context absent from the n-grams the generator is expected to
        return the id of '<END>'.
        """
        words = (
            'i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
        )
        storage = WordStorage()
        storage.update(words)
        tri_grams = NGramTrie(3, encode_text(storage, words))
        text_generator = NGramTextGenerator(storage, tri_grams)

        # no n-gram starts with ('i', 'name'), so the most frequent option
        # should be returned
        unseen_context = tuple(storage.get_id(token)
                               for token in ('i', 'name'))
        most_frequent = storage.get_id('<END>')  # it appears twice

        predicted = text_generator._generate_next_word(unseen_context)
        self.assertEqual(most_frequent, predicted)
    def test_ngram_text_generator_generate_sentence_ideal(self):
        """
        Starting from ('i', ) the second id of the sentence must be 'have'
        and the last one '<END>'.
        """
        words = (
            'i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
            'i', 'have', 'a', 'dog', 'too', '<END>',
            'his', 'name', 'is', 'rex', '<END>',
            'her', 'name', 'is', 'rex', 'too', '<END>',
        )
        word_storage = WordStorage()
        word_storage.update(words)
        bi_grams = NGramTrie(2, encode_text(word_storage, words))
        seed = (word_storage.get_id('i'), )

        have_id = word_storage.get_id('have')
        end_id = word_storage.get_id('<END>')

        text_generator = NGramTextGenerator(word_storage, bi_grams)
        sentence = text_generator._generate_sentence(seed)

        self.assertEqual(sentence[1], have_id)
        self.assertEqual(sentence[-1], end_id)
Exemple #18
0
    def test_calculate_likelihood_no_such_context(self):
        """
        The likelihood of 'dog' after the context ('<END>', '<END>') is
        expected to be 0.0.
        """
        words = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                 'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
                 'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is',
                 'rex', 'too', '<END>')
        word_storage = WordStorage()
        word_storage.update(words)
        tri_grams = NGramTrie(3, encode_text(word_storage, words))

        word_id = word_storage.get_id('dog')
        unseen_context = tuple(word_storage.get_id(token)
                               for token in ('<END>', '<END>'))

        text_generator = LikelihoodBasedTextGenerator(word_storage, tri_grams)
        likelihood = text_generator._calculate_maximum_likelihood(
            word_id, unseen_context)
        self.assertEqual(0.0, likelihood)
Exemple #19
0
    def test_calculate_likelihood_incorrect_word(self):
        """
        _calculate_maximum_likelihood must raise ValueError for each
        invalid word argument.
        """
        words = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                 'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
                 'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is',
                 'rex', 'too', '<END>')
        word_storage = WordStorage()
        word_storage.update(words)
        bi_grams = NGramTrie(2, encode_text(word_storage, words))

        invalid_words = [(), [], None, 123]
        # NOTE(review): the trie is a bigram trie but this context holds two
        # ids -- confirm the ValueError is caused by the word and not by
        # the context size.
        seed = tuple(word_storage.get_id(token) for token in ('have', 'a'))

        text_generator = LikelihoodBasedTextGenerator(word_storage, bi_grams)

        for invalid_word in invalid_words:
            with self.assertRaises(ValueError):
                text_generator._calculate_maximum_likelihood(
                    invalid_word, seed)
Exemple #20
0
def load_model(path_to_saved_model: str) -> NGramTextGenerator:
    """
    Rebuild an NGramTextGenerator from '<path_to_saved_model>.json'.

    The JSON object is read for '_word_storage' -> 'storage' and for
    '_n_gram_trie' -> 'size', 'encoded_text', 'n_grams',
    'n_gram_frequencies' and 'uni_grams'.

    :param path_to_saved_model: file path without the '.json' suffix
    :return: generator built from the restored word storage and trie
    :raises ValueError: if path_to_saved_model is not a string
    """
    # Imported locally so the function does not depend on a module-level
    # import being present.
    import ast

    if not isinstance(path_to_saved_model, str):
        raise ValueError

    # Keep the file open only while it is being parsed.
    with open(path_to_saved_model + '.json', 'r') as json_file:
        generator_json = json.load(json_file)

    words = WordStorage()
    words.storage = generator_json['_word_storage']['storage']

    trie_json = generator_json['_n_gram_trie']
    # (0, 1) is a throwaway text; the real fields are assigned just below.
    trie = NGramTrie(trie_json['size'], (0, 1))
    trie.encoded_text = trie_json['encoded_text']
    trie.n_grams = tuple(tuple(gram) for gram in trie_json['n_grams'])
    # Keys are Python literals stored as strings. literal_eval parses them
    # without executing code, unlike eval(), which would run whatever
    # expression a tampered model file contains.
    trie.n_gram_frequencies = {
        ast.literal_eval(key): value
        for key, value in trie_json['n_gram_frequencies'].items()
    }
    trie.uni_grams = {
        ast.literal_eval(key): value
        for key, value in trie_json['uni_grams'].items()
    }
    return NGramTextGenerator(words, trie)
Exemple #21
0
"""
Lab 4
"""

from main import *

from ngrams.ngram_trie import NGramTrie

if __name__ == '__main__':
    # Build a bigram generator over a small corpus and check that the word
    # generated right after 'i' is 'have'.
    corpus = (
        'i', 'have', 'a', 'cat', '<END>',
        'his', 'name', 'is', 'bruno', '<END>',
        'i', 'have', 'a', 'dog', 'too', '<END>',
        'his', 'name', 'is', 'rex', '<END>',
        'her', 'name', 'is', 'rex', 'too', '<END>',
    )
    storage = WordStorage()
    storage.update(corpus)
    encoded = encode_text(storage, corpus)
    trie = NGramTrie(2, encoded)
    context = (storage.get_id('i'), )

    first_generated = storage.get_id('have')
    last_generated = storage.get_id('<END>')

    generator = NGramTextGenerator(storage, trie)
    actual = generator._generate_sentence(context)

    RESULT = 1 if actual[1] == first_generated else 0
    # DO NOT REMOVE NEXT LINE - KEEP IT INTENTIONALLY LAST
    assert RESULT == 1, ''
Exemple #22
0
'''

from ngrams.ngram_trie import NGramTrie
from lab_4.main import encode_text, WordStorage, LikelihoodBasedTextGenerator, decode_text

if __name__ == '__main__':
    # Generate two sentences with the likelihood-based trigram generator,
    # decode them and compare against the expected strings.
    corpus = (
        'i', 'have', 'a', 'cat', '<END>',
        'his', 'name', 'is', 'bruno', '<END>',
        'i', 'have', 'a', 'dog', 'too', '<END>',
        'his', 'name', 'is', 'rex', '<END>',
        'her', 'name', 'is', 'rex', 'too', '<END>',
    )

    storage = WordStorage()
    storage.update(corpus)
    encoded = encode_text(storage, corpus)
    trie = NGramTrie(3, encoded)

    context = (storage.get_id('name'), storage.get_id('is'))
    end = storage.get_id('<END>')

    generator = LikelihoodBasedTextGenerator(storage, trie)
    to_decode = generator.generate_text(context, 2)

    RESULT = decode_text(storage, to_decode)
    print(RESULT)
    assert RESULT == ('Name is rex', 'Her name is rex')
Exemple #23
0

print("GENERATE WORD WITH LIKELIHOOD")

# Small demo corpus; '<END>' marks sentence boundaries.
corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is', 'bruno',
          '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>', 'his', 'name',
          'is', 'rex', '<END>', 'her', 'name', 'is', 'rex', 'too', '<END>')

storage = WordStorage()
storage.update(corpus)
encoded = encode_text(storage, corpus)

# Trigram model queried with the two-word context ('have', 'a').
trie = NGramTrie(3, encoded)
context = (storage.get_id('have'), storage.get_id('a'))
generator = LikelihoodBasedTextGenerator(storage, trie)

generated_word = generator._generate_next_word(context)
print(f"generated word in context {generated_word}")
print("-------------------------------------------")


print("BACKOFF GENERATOR")

two = NGramTrie(2, encoded)
trie = NGramTrie(3, encoded)

context = (storage.get_id('name'),