def main():
    """
    Demonstrate next-word prediction with a trigram/bigram back-off generator.

    Tokenizes a short sample text, encodes it, builds a trigram and a bigram
    trie, asks the generator for the word following the context 'name is',
    and prints the expected and actual words. The generator is then written
    to 'model.txt' with save_model and read back with load_model (the loaded
    object is not used).

    :return: True if the predicted word equals 'rex', False otherwise
    """
    text = ('I have a cat. His name is Bruno. '
            'I have a dog too. His name is Rex. '
            'Her name is Rex too.')
    tokens = tokenize_by_sentence(text)

    word_storage = WordStorage()
    word_storage.update(tokens)
    encoded_tokens = encode_text(word_storage, tokens)

    bigram_trie = NGramTrie(2, encoded_tokens)
    trigram_trie = NGramTrie(3, encoded_tokens)

    context = tuple(word_storage.get_id(word) for word in ('name', 'is'))

    # The trigram trie is passed first, the bigram trie second.
    generator = BackOffGenerator(word_storage, trigram_trie, bigram_trie)

    expected = 'rex'
    predicted_id = generator._generate_next_word(context)
    actual = word_storage.get_word(predicted_id)

    print(f'TEXT:\n{text}')
    print(f'\nEXPECTED WORD AFTER name is IS {expected}')
    print(f'ACTUAL WORD AFTER name is IS {actual}')

    # Round-trip the model through disk; the loaded result is discarded.
    save_model(generator, 'model.txt')
    load_model('model.txt')

    return actual == expected
def test_most_freq_word_end(self):
    """
    Checks that most_freq_word returns the id of '<END>'
    for the context 'his name is bruno'
    """
    corpus = (
        'i', 'have', 'a', 'cat', '<END>',
        'his', 'name', 'is', 'bruno', '<END>',
        'i', 'have', 'a', 'dog', 'too', '<END>',
        'his', 'name', 'is', 'rex', '<END>',
        'her', 'name', 'is', 'rex', 'too', '<END>',
    )

    word_storage = WordStorage()
    word_storage.update(corpus)
    encoded_corpus = encode_text(word_storage, corpus)

    five_grams = NGramTrie(5, encoded_corpus)
    three_grams = NGramTrie(3, encoded_corpus)
    four_grams = NGramTrie(4, encoded_corpus)

    end_id = word_storage.get_id('<END>')
    context = tuple(
        word_storage.get_id(word) for word in ('his', 'name', 'is', 'bruno')
    )

    # Tries are handed over in the order 5, 3, 4.
    generator = BackOffGenerator(word_storage, five_grams, three_grams, four_grams)

    self.assertEqual(end_id, generator.most_freq_word(context))
def test_generate_next_word_context_incorrect(self):
    """
    Checks that _generate_next_word returns the id of 'rex' for the context
    'name is cat', a word sequence that never occurs in the corpus
    """
    corpus = (
        'i', 'have', 'a', 'cat', '<END>',
        'his', 'name', 'is', 'bruno', '<END>',
        'i', 'have', 'a', 'dog', 'too', '<END>',
        'his', 'name', 'is', 'rex', '<END>',
        'her', 'name', 'is', 'rex', 'too', '<END>',
    )

    word_storage = WordStorage()
    word_storage.update(corpus)
    encoded_corpus = encode_text(word_storage, corpus)

    four_grams = NGramTrie(4, encoded_corpus)
    three_grams = NGramTrie(3, encoded_corpus)
    two_grams = NGramTrie(2, encoded_corpus)

    rex_id = word_storage.get_id('rex')
    context = tuple(word_storage.get_id(word) for word in ('name', 'is', 'cat'))

    # Tries are handed over in the order 4, 2, 3.
    generator = BackOffGenerator(word_storage, four_grams, two_grams, three_grams)

    self.assertEqual(rex_id, generator._generate_next_word(context))
def test_text_generator_no_context(self):
    """
    Checks that generate_text yields only truthy items for the context
    'cat dogs', a word pair that never occurs together in the corpus
    """
    corpus = (
        'cat', 'has', 'paws', '<END>',
        'dogs', 'have', 'noses', '<END>',
        'cat', 'has', 'whiskers', '<END>',
    )

    word_storage = WordStorage()
    word_storage.update(corpus)
    encoded_corpus = encode_text(word_storage, corpus)

    three_grams = NGramTrie(3, encoded_corpus)
    two_grams = NGramTrie(2, encoded_corpus)
    four_grams = NGramTrie(4, encoded_corpus)

    context = tuple(word_storage.get_id(word) for word in ('cat', 'dogs'))

    # Tries are handed over in the order 3, 2, 4.
    generator = BackOffGenerator(word_storage, three_grams, two_grams, four_grams)
    generated = generator.generate_text(context, 3)

    self.assertTrue(all(generated))
def realize_backoff_generator(text):
    """
    Generate text with a trigram/bigram back-off generator and decode it.

    Fills a fresh WordStorage from ``text``, encodes ``text``, builds a
    bigram and a trigram trie, then calls ``generate_text`` with the
    context 'if you' and the count 3.

    :param text: corpus passed to WordStorage.update and encode_text
    :return: the result of decode_text applied to the generated text
    """
    word_storage = WordStorage()
    word_storage.update(text)
    encoded = encode_text(word_storage, text)

    bigram_trie = NGramTrie(2, encoded)
    trigram_trie = NGramTrie(3, encoded)

    context = tuple(word_storage.get_id(word) for word in ('if', 'you'))

    # The trigram trie is passed first, the bigram trie second.
    generator = BackOffGenerator(word_storage, trigram_trie, bigram_trie)
    generated = generator.generate_text(context, 3)

    return decode_text(word_storage, generated)
def test_most_freq_word_incorrect_context(self):
    """
    Checks that most_freq_word raises ValueError for every invalid context
    """
    corpus = (
        'i', 'have', 'a', 'cat', '<END>',
        'his', 'name', 'is', 'bruno', '<END>',
        'i', 'have', 'a', 'dog', 'too', '<END>',
        'his', 'name', 'is', 'rex', '<END>',
        'her', 'name', 'is', 'rex', 'too', '<END>',
    )

    word_storage = WordStorage()
    word_storage.update(corpus)
    encoded_corpus = encode_text(word_storage, corpus)

    three_grams = NGramTrie(3, encoded_corpus)
    two_grams = NGramTrie(2, encoded_corpus)
    four_grams = NGramTrie(4, encoded_corpus)

    invalid_contexts = [[], {}, (2000, 1000,), None, 9, 9.34, True]

    # Tries are handed over in the order 3, 2, 4.
    generator = BackOffGenerator(word_storage, three_grams, two_grams, four_grams)

    for invalid_context in invalid_contexts:
        with self.assertRaises(ValueError):
            generator.most_freq_word(invalid_context)
def test_backoff_generator_instance_creation(self):
    """
    Checks that class creates correct instance
    """
    word_storage = WordStorage()
    ngram = NGramTrie(2, ())

    generator = BackOffGenerator(word_storage, ngram)

    self.assertEqual(generator._word_storage, word_storage)
    # assertIn performs the same membership check as assertTrue(x in y),
    # but its failure message shows the member and the container.
    self.assertIn(ngram, generator._n_gram_tries)
def test_generate_next_word_short_context(self):
    """
    Checks that _generate_next_word returns the id of 'bye' for the
    one-word context '<END>' on a two-token corpus
    """
    corpus = ('bye', '<END>')

    word_storage = WordStorage()
    word_storage.update(corpus)
    encoded_corpus = encode_text(word_storage, corpus)

    four_grams = NGramTrie(4, encoded_corpus)
    three_grams = NGramTrie(3, encoded_corpus)
    two_grams = NGramTrie(2, encoded_corpus)

    bye_id = word_storage.get_id('bye')
    context = (word_storage.get_id('<END>'),)

    # Tries are handed over in the order 2, 4, 3.
    generator = BackOffGenerator(word_storage, two_grams, four_grams, three_grams)

    self.assertEqual(bye_id, generator._generate_next_word(context))
def test_backoff_generator_instance_creation_complex(self):
    """
    Checks that class creates correct instance with several tries
    """
    word_storage = WordStorage()
    ngram = NGramTrie(2, ())
    three = NGramTrie(3, ())
    four = NGramTrie(4, ())

    generator = BackOffGenerator(word_storage, ngram, three, four)

    self.assertEqual(generator._word_storage, word_storage)
    # assertIn performs the same membership check as assertTrue(x in y),
    # but its failure message shows the member and the container.
    self.assertIn(ngram, generator._n_gram_tries)
    self.assertIn(three, generator._n_gram_tries)
    self.assertIn(four, generator._n_gram_tries)
def test_generate_next_word_no_context(self):
    """
    Checks that _generate_next_word returns the id of '<END>' for the
    context 'bye', the last token of the corpus with nothing after it
    """
    # A comma was missing between 'watch' and 'with', so implicit string
    # literal concatenation produced the single bogus token 'watchwith'.
    corpus = (
        'i', 'watch', 'a', 'horror', 'movie', '<END>',
        'would', 'you', 'like', 'to', 'watch', 'with', 'me', '<END>',
        'i', 'do', 'not', 'like', 'such', 'films', '<END>',
        'i', 'like', 'to', 'watch', 'drama', 'movies', '<END>',
        'bye',
    )

    storage = WordStorage()
    storage.update(corpus)
    encoded = encode_text(storage, corpus)

    four = NGramTrie(4, encoded)
    trie = NGramTrie(3, encoded)
    two = NGramTrie(2, encoded)

    expected_word = storage.get_id('<END>')
    context = (storage.get_id('bye'),)

    # Tries are handed over in the order 2, 4, 3.
    generator = BackOffGenerator(storage, two, four, trie)
    actual = generator._generate_next_word(context)

    self.assertEqual(expected_word, actual)
"""
Lab 4 implementation starter
"""

from lab_4.main import (
    BackOffGenerator,
    encode_text,
    WordStorage,
    decode_text,
    tokenize_by_sentence,
)
from lab_4.ngrams.ngram_trie import NGramTrie

if __name__ == '__main__':
    # Read and tokenize the corpus file.
    with open('lab_3/Frank_Baum.txt', 'r', encoding='utf-8') as file_frank:
        corpus = tokenize_by_sentence(file_frank.read())

    storage = WordStorage()
    storage.update(corpus)
    encoded = encode_text(storage, corpus)

    trie = NGramTrie(3, encoded)
    four = NGramTrie(4, encoded)

    context = tuple(storage.get_id(word) for word in ('when', 'the'))

    # The 4-gram trie is passed first, the trigram trie second.
    generator = BackOffGenerator(storage, four, trie)
    generated_text = generator.generate_text(context, 5)

    RESULT = decode_text(storage, generated_text)
    # DO NOT REMOVE NEXT LINE - KEEP IT INTENTIONALLY LAST
    assert RESULT, 'Encoding not working'
# Fill the storage from the corpus and encode the corpus with it.
storage.update(corpus)
encoded = encode_text(storage, corpus)

# Both bigram-based generators share one trie and the context 'i';
# each generates text, decodes it and prints it.
trie = ngrams.NGramTrie(2, encoded)
context = (storage.get_id('i'), )
for generator_class in (NGramTextGenerator, LikelihoodBasedTextGenerator):
    generator = generator_class(storage, trie)
    actual = generator.generate_text(context, 5)
    actual = decode_text(storage, actual)
    print(actual)

# Back-off generator: trigram trie first, bigram trie second,
# with the context 'name is'.
two = ngrams.NGramTrie(2, encoded)
trie = ngrams.NGramTrie(3, encoded)
context = tuple(storage.get_id(word) for word in ('name', 'is'))

generator = BackOffGenerator(storage, trie, two)
actual = generator.generate_text(context, 5)

RESULT = decode_text(storage, actual)
print(RESULT)
assert RESULT == (
    'Name is rex',
    'Her name is rex',
    'Her name is rex',
    'Her name is rex',
    'Her name is rex',
)
# Likelihood-based generator over a 4-gram trie with the context 'i have a'.
trie = NGramTrie(4, encoded)
context = tuple(storage.get_id(word) for word in ('i', 'have', 'a'))

generator_likelihood = LikelihoodBasedTextGenerator(storage, trie)
generated_text = generator_likelihood.generate_text(context, 3)
decoded_gen_text = decode_text(storage, generated_text)

print('Likelihood generator generates sentences:')
print(*decoded_gen_text, sep='. ', end='.\n')

# Back-off generator: trigram trie first, bigram trie second,
# with the context 'i have'.
two = NGramTrie(2, encoded)
trie = NGramTrie(3, encoded)
context = tuple(storage.get_id(word) for word in ('i', 'have'))

generator_backoff = BackOffGenerator(storage, trie, two)
actual = generator_backoff.generate_text(context, 3)
RESULT = decode_text(storage, actual)

print('Backoff generator generates sentences:')
print(*RESULT, sep='. ', end='.\n')

assert RESULT == (
    'I have a colourful dog',
    'I havent a cat too',
    'They have beautiful dogs',
), 'Text generator does not work'
"""
Lab 4 starter
"""

from lab_4.main import (
    BackOffGenerator,
    encode_text,
    WordStorage,
    decode_text,
)
from lab_4.ngrams.ngram_trie import NGramTrie

if __name__ == '__main__':
    # One row per sentence; every sentence is closed by '<END>'.
    corpus = (
        'i', 'have', 'a', 'cat', '<END>',
        'his', 'name', 'is', 'bruno', '<END>',
        'i', 'have', 'a', 'dog', 'too', '<END>',
        'his', 'name', 'is', 'rex', '<END>',
        'her', 'name', 'is', 'rex', 'too', '<END>',
    )

    storage = WordStorage()
    storage.update(corpus)
    encoded_text = encode_text(storage, corpus)

    trie = NGramTrie(3, encoded_text)
    four = NGramTrie(4, encoded_text)

    context = tuple(storage.get_id(word) for word in ('his', 'name', 'is'))

    # The trigram trie is passed first, the 4-gram trie second.
    generator = BackOffGenerator(storage, trie, four)
    text = generator.generate_text(context, 3)
    actual = decode_text(storage, text)

    # RESULT holds the expected sentences that the output is compared to.
    RESULT = ('His name is bruno', 'I have a cat', 'His name is bruno')
    assert RESULT == actual, 'Not work'