def main():
    """Demonstrate back-off generation: predict the word following 'name is'.

    Builds a word storage and two tries (bigram and trigram) from a tiny
    corpus, asks a BackOffGenerator for the next word after the context
    ('name', 'is'), round-trips the model through save/load, and returns
    True when the predicted word matches the expected one ('rex').
    """
    text = ('I have a cat. His name is Bruno. '
            'I have a dog too. His name is Rex. '
            'Her name is Rex too.')

    sentences = tokenize_by_sentence(text)
    word_storage = WordStorage()
    word_storage.update(sentences)
    encoded_corpus = encode_text(word_storage, sentences)

    # Trigram trie is the primary model; bigram trie is the back-off level.
    bigram_trie = NGramTrie(2, encoded_corpus)
    trigram_trie = NGramTrie(3, encoded_corpus)

    context = (
        word_storage.get_id('name'),
        word_storage.get_id('is'),
    )

    text_generator = BackOffGenerator(word_storage, trigram_trie, bigram_trie)

    expected = 'rex'
    actual = word_storage.get_word(text_generator._generate_next_word(context))

    print(f'TEXT:\n{text}')
    print(f'\nEXPECTED WORD AFTER name is IS {expected}')
    print(f'ACTUAL WORD AFTER name is IS {actual}')

    # Round-trip the trained generator through the persistence helpers.
    save_model(text_generator, 'model.txt')
    load_model('model.txt')

    return actual == expected
def test_load_model_takes_less_time(self):
    """Loading a saved model must be faster than training one from scratch,
    and the loaded model must generate the same text.

    NOTE: statement order inside the two timed sections is deliberately
    preserved — the first window times encoding + trie construction +
    generation, the second times load + generation only.
    """
    with open('lab_1/data.txt', 'r', encoding='utf-8') as big_file:
        big_data = big_file.read()
    sentences = tokenize_by_sentence(big_data)

    storage = WordStorage()
    storage.update(sentences)

    context = (
        storage.get_id('despite'),
        storage.get_id('the'),
    )

    # Time the full train-then-generate path.
    training_started = time()
    encoded_corpus = encode_text(storage, sentences)
    trie = NGramTrie(3, encoded_corpus)
    generator = NGramTextGenerator(storage, trie)
    first_output = generator.generate_text(context, 3)
    training_elapsed = time() - training_started

    save_model(generator, 'model_training')

    # Time the load-then-generate path for the same context.
    loading_started = time()
    restored = load_model('model_training')
    second_output = restored.generate_text(context, 3)
    loading_elapsed = time() - loading_started

    self.assertGreater(training_elapsed, loading_elapsed)
    self.assertEqual(first_output, second_output)
def test_load_model_ideal(self):
    """A saved-then-loaded generator must preserve the n-gram order, every
    n-gram frequency, and the entire word storage of the original."""
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'dog', 'too', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'rex', 'too', '<END>')

    storage = WordStorage()
    storage.update(corpus)
    encoded_corpus = encode_text(storage, corpus)
    trie = NGramTrie(2, encoded_corpus)
    original = NGramTextGenerator(storage, trie)

    save_model(original, 'my_awesome_model')
    restored = load_model('my_awesome_model')

    # Trie must survive the round trip: same order and same frequency table.
    self.assertEqual(original._n_gram_trie.n_grams,
                     restored._n_gram_trie.n_grams)
    self.assertEqual(len(original._n_gram_trie.n_gram_frequencies),
                     len(restored._n_gram_trie.n_gram_frequencies))
    for ngram, frequency in original._n_gram_trie.n_gram_frequencies.items():
        self.assertIn(ngram, restored._n_gram_trie.n_gram_frequencies)
        self.assertEqual(frequency,
                         restored._n_gram_trie.n_gram_frequencies[ngram])

    # Word storage must survive the round trip: same words, same ids.
    self.assertEqual(len(original._word_storage.storage),
                     len(restored._word_storage.storage))
    for word, word_id in original._word_storage.storage.items():
        self.assertIn(word, restored._word_storage.storage)
        self.assertEqual(word_id, restored._word_storage.storage[word])
def realize_likelihood_generator(text):
    """Generate and decode 3 sentences with a pre-trained likelihood model.

    A local WordStorage built from *text* supplies the context ids and is
    used for decoding, while the generator itself runs on the storage and
    trie of the model loaded from disk.

    NOTE(review): decoding uses the locally-built storage rather than the
    loaded model's storage — this assumes the two assign identical ids;
    verify against how 'lab_4/likelihood_model.json' was trained.
    """
    likelihood_storage = WordStorage()
    likelihood_storage.update(text)

    context = (
        likelihood_storage.get_id('i'),
        likelihood_storage.get_id('shall'),
    )

    model = load_model('lab_4/likelihood_model.json')
    generator = LikelihoodBasedTextGenerator(model.word_storage,
                                             model.n_gram_trie)

    generated_ids = generator.generate_text(context, 3)
    return decode_text(likelihood_storage, generated_ids)
def test_load_model_has_generator_methods(self):
    """A loaded model must expose exactly the same attribute/method names
    as the generator it was saved from.

    Fix: ``assertEquals`` is a deprecated alias (removed in Python 3.12);
    use ``assertEqual`` instead.
    """
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'dog', 'too', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'rex', 'too', '<END>')

    storage = WordStorage()
    storage.update(corpus)
    encoded = encode_text(storage, corpus)
    trie = NGramTrie(2, encoded)
    generator = NGramTextGenerator(storage, trie)

    save_model(generator, 'my_awesome_model')
    loaded_model = load_model('my_awesome_model')

    # dir() gives the full sorted name listing, so equality here means the
    # loaded object's interface matches the original's.
    self.assertEqual(dir(loaded_model), dir(generator))