def test_end(self):
    """Check that the decoded result contains no '<END>' marker."""
    words = ('i', 'have', 'a', 'cat', '<END>',
             'his', 'name', 'is', 'bruno', '<END>',
             'i', 'have', 'a', 'cat', '<END>',
             'his', 'name', 'is', 'rex', '<END>',
             'her', 'name', 'is', 'cat', '<END>')
    word_storage = WordStorage()
    word_storage.update(words)
    encoded_corpus = encode_text(word_storage, words)
    tri_gram_trie = NGramTrie(3, encoded_corpus)
    start_context = (word_storage.get_id('a'), word_storage.get_id('cat'))
    text_generator = LikelihoodBasedTextGenerator(word_storage, tri_gram_trie)
    generated = text_generator.generate_text(start_context, 1)
    decoded = decode_text(word_storage, generated)
    self.assertEqual(('A cat', ), decoded)
def test_decode_text_ideal_conditions(self):
    """Decoded sentences must be capitalised, alphabetic-terminated and free of '<END>'."""
    words = ('i', 'have', 'a', 'cat', '<END>',
             'his', 'name', 'is', 'bruno', '<END>',
             'i', 'have', 'a', 'dog', 'too', '<END>',
             'his', 'name', 'is', 'rex', '<END>',
             'her', 'name', 'is', 'rex', 'too', '<END>')
    word_storage = WordStorage()
    word_storage.update(words)
    encoded_corpus = encode_text(word_storage, words)
    tri_gram_trie = NGramTrie(3, encoded_corpus)
    start_context = (word_storage.get_id('name'), word_storage.get_id('is'))
    text_generator = LikelihoodBasedTextGenerator(word_storage, tri_gram_trie)
    generated = text_generator.generate_text(start_context, 2)
    for sentence in decode_text(word_storage, generated):
        self.assertNotIn('<END>', sentence)
        self.assertTrue(sentence[0].isupper())
        self.assertTrue(sentence[-1].isalpha())
def test_load_model_ideal(self):
    """A model restored from disk must match the model that was saved."""
    words = ('i', 'have', 'a', 'cat', '<END>',
             'his', 'name', 'is', 'bruno', '<END>',
             'i', 'have', 'a', 'dog', 'too', '<END>',
             'his', 'name', 'is', 'rex', '<END>',
             'her', 'name', 'is', 'rex', 'too', '<END>')
    word_storage = WordStorage()
    word_storage.update(words)
    encoded_corpus = encode_text(word_storage, words)
    bi_gram_trie = NGramTrie(2, encoded_corpus)
    original = NGramTextGenerator(word_storage, bi_gram_trie)
    save_model(original, 'my_awesome_model')
    restored = load_model('my_awesome_model')
    # The trie must survive the round trip intact.
    self.assertEqual(original._n_gram_trie.n_grams,
                     restored._n_gram_trie.n_grams)
    original_freqs = original._n_gram_trie.n_gram_frequencies
    restored_freqs = restored._n_gram_trie.n_gram_frequencies
    self.assertEqual(len(original_freqs), len(restored_freqs))
    for n_gram, count in original_freqs.items():
        self.assertIn(n_gram, restored_freqs)
        self.assertEqual(count, restored_freqs[n_gram])
    # So must the word storage.
    self.assertEqual(len(original._word_storage.storage),
                     len(restored._word_storage.storage))
    for word, word_id in original._word_storage.storage.items():
        self.assertIn(word, restored._word_storage.storage)
        self.assertEqual(word_id, restored._word_storage.storage[word])
def test_generate_text_large_context(self):
    """
    should generate simple case with three sentences out of small corpus
    """
    words = ('i', 'have', 'a', 'cat', '<END>',
             'his', 'name', 'is', 'bruno', '<END>',
             'i', 'have', 'a', 'dog', 'too', '<END>',
             'his', 'name', 'is', 'rex', '<END>',
             'her', 'name', 'is', 'rex', 'too', '<END>')
    word_storage = WordStorage()
    word_storage.update(words)
    encoded_corpus = encode_text(word_storage, words)
    five_gram_trie = NGramTrie(5, encoded_corpus)
    text_generator = NGramTextGenerator(word_storage, five_gram_trie)
    start_context = (word_storage.get_id('i'),
                     word_storage.get_id('have'),
                     word_storage.get_id('a'),
                     word_storage.get_id('bruno'))
    end_id = word_storage.get_id('<END>')
    generated = text_generator.generate_text(start_context, 3)
    # Three sentences -> exactly three '<END>' tokens in the output.
    self.assertEqual(generated.count(end_id), 3)
def test_text_generator_generate_sentence_proper_beginning(self):
    """
    Checks that class creates correct sentence from a context '<END>'
    without '<END>' in the beginning
    """
    # BUG FIX: the original corpus had `'music' '<END>'` (missing comma),
    # which Python implicitly concatenates into the single token
    # 'music<END>', so that sentence had no separate '<END>' terminator.
    corpus = ('my', 'favourite', 'subject', 'is', 'maths', '<END>',
              'his', 'favourite', 'thing', 'is', 'music', '<END>',
              'i', 'have', 'a', 'favourite', 'film', '<END>',
              'my', 'family', 'likes', 'avatar', '<END>',
              'my', 'favourite', 'subject', 'is', 'music', '<END>')
    storage = WordStorage()
    storage.update(corpus)
    encoded = encode_text(storage, corpus)
    trie = NGramTrie(2, encoded)
    context = (storage.get_id('<END>'), )
    first_generated = storage.get_id('my')
    last_generated = storage.get_id('<END>')
    generator = NGramTextGenerator(storage, trie)
    actual = generator._generate_sentence(context)
    # The sentence must not start with the '<END>' that seeded it,
    # but must still terminate with '<END>'.
    self.assertNotEqual(storage.get_id('<END>'), actual[0])
    self.assertEqual(first_generated, actual[0])
    self.assertEqual(last_generated, actual[-1])
def test_generate_next_word_context_incorrect(self):
    """Back-off generator must still pick the right word for this context."""
    words = ('i', 'have', 'a', 'cat', '<END>',
             'his', 'name', 'is', 'bruno', '<END>',
             'i', 'have', 'a', 'dog', 'too', '<END>',
             'his', 'name', 'is', 'rex', '<END>',
             'her', 'name', 'is', 'rex', 'too', '<END>')
    word_storage = WordStorage()
    word_storage.update(words)
    encoded_corpus = encode_text(word_storage, words)
    four_gram_trie = NGramTrie(4, encoded_corpus)
    tri_gram_trie = NGramTrie(3, encoded_corpus)
    bi_gram_trie = NGramTrie(2, encoded_corpus)
    expected_word = word_storage.get_id('rex')
    query_context = (word_storage.get_id('name'),
                     word_storage.get_id('is'),
                     word_storage.get_id('cat'))
    text_generator = BackOffGenerator(word_storage, four_gram_trie,
                                      bi_gram_trie, tri_gram_trie)
    self.assertEqual(expected_word,
                     text_generator._generate_next_word(query_context))
def test_text_generator_no_context(self):
    """
    checks if the program can generate sentences without given context
    """
    words = ('cat', 'has', 'paws', '<END>',
             'dogs', 'have', 'noses', '<END>',
             'cat', 'has', 'whiskers', '<END>')
    word_storage = WordStorage()
    word_storage.update(words)
    encoded_corpus = encode_text(word_storage, words)
    tri_gram_trie = NGramTrie(3, encoded_corpus)
    bi_gram_trie = NGramTrie(2, encoded_corpus)
    four_gram_trie = NGramTrie(4, encoded_corpus)
    start_context = (word_storage.get_id('cat'), word_storage.get_id('dogs'))
    text_generator = BackOffGenerator(word_storage, tri_gram_trie,
                                      bi_gram_trie, four_gram_trie)
    generated = text_generator.generate_text(start_context, 3)
    # Every produced token id must be truthy (non-zero).
    self.assertTrue(all(generated))
def test_load_model_takes_less_time(self):
    """Loading a saved model must be faster than training from scratch."""
    with open('lab_1/data.txt', 'r', encoding='utf-8') as big_file:
        raw_text = big_file.read()
    tokens = tokenize_by_sentence(raw_text)
    word_storage = WordStorage()
    word_storage.update(tokens)
    start_context = (word_storage.get_id('despite'), word_storage.get_id('the'))
    # Time the full train-and-generate path.
    training_started = time()
    encoded_corpus = encode_text(word_storage, tokens)
    tri_gram_trie = NGramTrie(3, encoded_corpus)
    trained_generator = NGramTextGenerator(word_storage, tri_gram_trie)
    trained_text = trained_generator.generate_text(start_context, 3)
    training_elapsed = time() - training_started
    save_model(trained_generator, 'model_training')
    # Time the load-and-generate path.
    loading_started = time()
    restored_generator = load_model('model_training')
    restored_text = restored_generator.generate_text(start_context, 3)
    loading_elapsed = time() - loading_started
    self.assertGreater(training_elapsed, loading_elapsed)
    self.assertEqual(trained_text, restored_text)
def test_most_freq_word_end(self):
    """most_freq_word must return '<END>' after a sentence-final context."""
    words = ('i', 'have', 'a', 'cat', '<END>',
             'his', 'name', 'is', 'bruno', '<END>',
             'i', 'have', 'a', 'dog', 'too', '<END>',
             'his', 'name', 'is', 'rex', '<END>',
             'her', 'name', 'is', 'rex', 'too', '<END>')
    word_storage = WordStorage()
    word_storage.update(words)
    encoded_corpus = encode_text(word_storage, words)
    five_gram_trie = NGramTrie(5, encoded_corpus)
    tri_gram_trie = NGramTrie(3, encoded_corpus)
    four_gram_trie = NGramTrie(4, encoded_corpus)
    expected_word = word_storage.get_id('<END>')
    query_context = (word_storage.get_id('his'),
                     word_storage.get_id('name'),
                     word_storage.get_id('is'),
                     word_storage.get_id('bruno'))
    text_generator = BackOffGenerator(word_storage, five_gram_trie,
                                      tri_gram_trie, four_gram_trie)
    self.assertEqual(expected_word, text_generator.most_freq_word(query_context))
def test_most_freq_word_incorrect_context(self):
    """most_freq_word must raise ValueError on malformed contexts."""
    words = ('i', 'have', 'a', 'cat', '<END>',
             'his', 'name', 'is', 'bruno', '<END>',
             'i', 'have', 'a', 'dog', 'too', '<END>',
             'his', 'name', 'is', 'rex', '<END>',
             'her', 'name', 'is', 'rex', 'too', '<END>')
    word_storage = WordStorage()
    word_storage.update(words)
    encoded_corpus = encode_text(word_storage, words)
    tri_gram_trie = NGramTrie(3, encoded_corpus)
    bi_gram_trie = NGramTrie(2, encoded_corpus)
    four_gram_trie = NGramTrie(4, encoded_corpus)
    text_generator = BackOffGenerator(word_storage, tri_gram_trie,
                                      bi_gram_trie, four_gram_trie)
    for bad_context in [[], {}, (2000, 1000), None, 9, 9.34, True]:
        with self.assertRaises(ValueError):
            text_generator.most_freq_word(bad_context)
def test_decode_text_ideal(self):
    """
    Generated text must terminate with '<END>' and decode into the
    expected capitalised sentences.
    """
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'dog', 'too', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'rex', 'too', '<END>')
    storage = WordStorage()
    storage.update(corpus)
    encoded = encode_text(storage, corpus)
    trie = NGramTrie(3, encoded)
    context = (
        storage.get_id('name'),
        storage.get_id('is'),
    )
    # FIX: removed leftover debug output `print('Я ТЕСТ', context)`
    # that polluted the test run.
    end = storage.get_id('<END>')
    generator = LikelihoodBasedTextGenerator(storage, trie)
    to_decode = generator.generate_text(context, 2)
    self.assertEqual(to_decode[-1], end)
    expected = ('Name is rex', 'Her name is rex')
    actual = decode_text(storage, to_decode)
    self.assertEqual(expected, actual)
def main():
    """Demonstrate back-off generation plus model save/load round trip."""
    text = ('I have a cat. His name is Bruno. '
            'I have a dog too. His name is Rex. '
            'Her name is Rex too.')
    sentences = tokenize_by_sentence(text)
    word_storage = WordStorage()
    word_storage.update(sentences)
    encoded_corpus = encode_text(word_storage, sentences)
    bi_gram_trie = NGramTrie(2, encoded_corpus)
    tri_gram_trie = NGramTrie(3, encoded_corpus)
    query_context = (word_storage.get_id('name'), word_storage.get_id('is'))
    generator = BackOffGenerator(word_storage, tri_gram_trie, bi_gram_trie)
    expected = 'rex'
    actual = word_storage.get_word(generator._generate_next_word(query_context))
    print(f'TEXT:\n{text}')
    print(f'\nEXPECTED WORD AFTER name is IS {expected}')
    print(f'ACTUAL WORD AFTER name is IS {actual}')
    save_model(generator, 'model.txt')
    load_model('model.txt')
    return actual == expected
def test_decode_text_incorrect_storage(self):
    """decode_text must raise ValueError when storage is not a WordStorage."""
    words = ('i', 'have', 'a', 'cat', '<END>',
             'his', 'name', 'is', 'bruno', '<END>',
             'i', 'have', 'a', 'dog', 'too', '<END>',
             'his', 'name', 'is', 'rex', '<END>',
             'her', 'name', 'is', 'rex', 'too', '<END>')
    word_storage = WordStorage()
    word_storage.update(words)
    encoded_corpus = encode_text(word_storage, words)
    tri_gram_trie = NGramTrie(3, encoded_corpus)
    start_context = (word_storage.get_id('name'), word_storage.get_id('is'))
    text_generator = LikelihoodBasedTextGenerator(word_storage, tri_gram_trie)
    generated = text_generator.generate_text(start_context, 2)
    for bad_storage in [(), [], 123, None, NGramTrie]:
        with self.assertRaises(ValueError):
            decode_text(bad_storage, generated)
def realize_n_gram_text_generator(text):
    """Build an n-gram generator from *text* and return decoded generated text."""
    storage = WordStorage()
    storage.update(text)
    encoded_corpus = encode_text(storage, text)
    trie = NGramTrie(3, encoded_corpus)
    context = (storage.get_id('my'), storage.get_id('dear'))
    generator = NGramTextGenerator(storage, trie)
    generated = generator.generate_text(context, 3)
    return decode_text(storage, generated)
def test_encode_text_empty_sentence(self):
    """
    Tests that encode_corpus function can handle empty sentence input
    """
    word_storage = WordStorage()
    empty_corpus = ()
    word_storage.update(empty_corpus)
    encoded = encode_text(word_storage, empty_corpus)
    self.assertEqual((), encoded)
def realize_backoff_generator(text):
    """Build a back-off generator from *text* and return decoded generated text."""
    storage = WordStorage()
    storage.update(text)
    encoded_corpus = encode_text(storage, text)
    bi_gram_trie = NGramTrie(2, encoded_corpus)
    tri_gram_trie = NGramTrie(3, encoded_corpus)
    context = (storage.get_id('if'), storage.get_id('you'))
    generator = BackOffGenerator(storage, tri_gram_trie, bi_gram_trie)
    generated = generator.generate_text(context, 3)
    return decode_text(storage, generated)
def test_ngram_text_generator_duplicates_words(self):
    """A looping corpus must still be cut off at the generation limit."""
    words = ('stop', 'it', 'stop', 'it', 'stop', 'it', '<END>')
    word_storage = WordStorage()
    word_storage.update(words)
    encoded_corpus = encode_text(word_storage, words)
    bi_gram_trie = NGramTrie(2, encoded_corpus)
    start_context = (word_storage.get_id('stop'), )
    text_generator = NGramTextGenerator(word_storage, bi_gram_trie)
    sentence = text_generator._generate_sentence(start_context)
    # context + at most 20 generated words + the closing '<END>'
    self.assertEqual(20 + len(start_context) + 1, len(sentence))
def test_save_model_incorrect_path(self):
    """Saving to a non-existent directory must raise FileNotFoundError."""
    words = ('i', 'have', 'a', 'cat', '<END>',
             'his', 'name', 'is', 'bruno', '<END>')
    word_storage = WordStorage()
    word_storage.update(words)
    encoded_corpus = encode_text(word_storage, words)
    bi_gram_trie = NGramTrie(2, encoded_corpus)
    text_generator = NGramTextGenerator(word_storage, bi_gram_trie)
    with self.assertRaises(FileNotFoundError):
        save_model(text_generator, r'some_folder/some_file')
def test_context_end(self):
    """
    checks if <END> is in the context
    """
    bad_context = ('cat', '<END>')
    words = ('i', 'have', 'a', 'cat', '<END>',
             'his', 'name', 'is', 'bruno', '<END>')
    word_storage = WordStorage()
    word_storage.update(words)
    encoded_corpus = encode_text(word_storage, words)
    tri_gram_trie = NGramTrie(3, encoded_corpus)
    text_generator = NGramTextGenerator(word_storage, tri_gram_trie)
    with self.assertRaises(ValueError):
        text_generator._generate_sentence(bad_context)
def test_encode_text_same_words_count(self):
    """
    Tests that encode_text function can assign correct id to the same words
    """
    word_storage = WordStorage()
    repeated_corpus = ('i', 'have', 'a', 'cat', '<END>',
                       'i', 'have', 'a', 'cat', '<END>')
    word_storage.update(repeated_corpus)
    encoded = encode_text(word_storage, repeated_corpus)
    # Both halves of the corpus are identical, so their ids must match.
    self.assertEqual(encoded[:5], encoded[5:])
def test_ngram_text_generator_identical_words(self):
    """A corpus of one repeated word must still respect the length cap."""
    words = ('deadline', 'deadline', 'deadline', 'deadline', 'deadline',
             '<END>')
    word_storage = WordStorage()
    word_storage.update(words)
    encoded_corpus = encode_text(word_storage, words)
    tri_gram_trie = NGramTrie(3, encoded_corpus)
    start_context = (word_storage.get_id('deadline'),
                     word_storage.get_id('deadline'))
    text_generator = NGramTextGenerator(word_storage, tri_gram_trie)
    sentence = text_generator._generate_sentence(start_context)
    # context + 20 generated words + 1 for the closing '<END>'
    self.assertEqual(20 + len(start_context) + 1, len(sentence))
def test_encode_text_ideal(self):
    """
    Tests that encode_text function generates id for each word
    """
    word_storage = WordStorage()
    words = ('i', 'have', 'a', 'cat', '<END>',
             'his', 'name', 'is', 'bruno', '<END>')
    word_storage.update(words)
    encoded = encode_text(word_storage, words)
    for token_id in encoded:
        self.assertIsInstance(token_id, int)
def test_ngram_text_generator_bad_num_input(self):  # new test
    """
    throws errors with bad inputs
    """
    words = ('i', 'have', 'a', 'cat', '<END>',
             'his', 'name', 'is', 'bruno', '<END>')
    word_storage = WordStorage()
    word_storage.update(words)
    encoded_corpus = encode_text(word_storage, words)
    bi_gram_trie = NGramTrie(2, encoded_corpus)
    text_generator = NGramTextGenerator(word_storage, bi_gram_trie)
    for bad_input in (-5, 0, -2, -1, -8):
        with self.assertRaises(ValueError):
            text_generator._generate_sentence(bad_input)
def test_get_most_frequent_gram_bad_inputs(self):
    """
    Checks that method returns empty tuple
    """
    words = ('i', 'have', 'a', 'cat', '<END>',
             'his', 'name', 'is', 'bruno', '<END>')
    word_storage = WordStorage()
    word_storage.update(words)
    encoded_corpus = encode_text(word_storage, words)
    tri_gram_trie = NGramTrie(3, encoded_corpus)
    text_generator = NGramTextGenerator(word_storage, tri_gram_trie)
    for bad_input in [[], {}, (), None, 9, 9.34, True]:
        self.assertEqual((), text_generator.get_most_frequent_gram(bad_input))
def test_text_generator_throws_errors(self):
    """
    throws errors with bad inputs
    """
    words = ('i', 'have', 'a', 'cat', '<END>',
             'his', 'name', 'is', 'bruno', '<END>')
    word_storage = WordStorage()
    word_storage.update(words)
    encoded_corpus = encode_text(word_storage, words)
    bi_gram_trie = NGramTrie(2, encoded_corpus)
    text_generator = NGramTextGenerator(word_storage, bi_gram_trie)
    for bad_input in [[], {}, None, 9, 9.34, True]:
        with self.assertRaises(ValueError):
            text_generator.generate_text(bad_input, 10)
def test_ngram_text_generator_generate_next_word(self):
    """
    Checks that next word generates properly
    """
    words = ('i', 'have', 'a', 'cat', '<END>',
             'i', 'have', 'a', 'bruno', 'cat', '<END>')
    word_storage = WordStorage()
    word_storage.update(words)
    encoded_corpus = encode_text(word_storage, words)
    tri_gram_trie = NGramTrie(3, encoded_corpus)
    text_generator = NGramTextGenerator(word_storage, tri_gram_trie)
    query_context = (word_storage.get_id('i'), word_storage.get_id('have'))
    # 'a' follows 'i have' in every sentence of the corpus.
    self.assertEqual(word_storage.get_id('a'),
                     text_generator._generate_next_word(query_context))
def test_ngram_text_generator_generate_sentence_properly(self):
    """
    generates correct output according to simple case
    """
    words = ('i', 'have', 'a', 'cat', '<END>',
             'his', 'name', 'is', 'bruno', '<END>')
    word_storage = WordStorage()
    word_storage.update(words)
    encoded_corpus = encode_text(word_storage, words)
    bi_gram_trie = NGramTrie(2, encoded_corpus)
    start_context = (word_storage.get_id('i'), )
    end_id = word_storage.get_id('<END>')
    text_generator = NGramTextGenerator(word_storage, bi_gram_trie)
    sentence = text_generator._generate_sentence(start_context)
    # Every generated sentence has to terminate with '<END>'.
    self.assertEqual(sentence[-1], end_id)
def test_ngram_text_generator_generate_next_word_incorrect_context(self):
    """
    Checks that method throws error
    """
    words = ('i', 'have', 'a', 'cat', '<END>',
             'his', 'name', 'is', 'bruno', '<END>')
    word_storage = WordStorage()
    word_storage.update(words)
    encoded_corpus = encode_text(word_storage, words)
    tri_gram_trie = NGramTrie(3, encoded_corpus)
    text_generator = NGramTextGenerator(word_storage, tri_gram_trie)
    # (3, ) is a context of the wrong size for a tri-gram trie.
    for bad_input in [[], {}, (3, ), None, 9, 9.34, True]:
        with self.assertRaises(ValueError):
            text_generator._generate_next_word(bad_input)
def test_get_most_frequent_gram_ideal(self):
    """
    Checks that most frequent ngram gets properly
    """
    words = ('i', 'have', 'a', 'cat', '<END>',
             'i', 'have', 'a', 'bruno', 'cat', '<END>')
    word_storage = WordStorage()
    word_storage.update(words)
    encoded_corpus = encode_text(word_storage, words)
    tri_gram_trie = NGramTrie(3, encoded_corpus)
    text_generator = NGramTextGenerator(word_storage, tri_gram_trie)
    query_context = (word_storage.get_id('i'), word_storage.get_id('have'))
    # ('i', 'have', 'a') occurs in both sentences, so it is the winner.
    expected_gram = (word_storage.get_id('i'),
                     word_storage.get_id('have'),
                     word_storage.get_id('a'))
    self.assertEqual(expected_gram,
                     text_generator.get_most_frequent_gram(query_context))
def test_length_of_sentence(self):
    """
    generates sentences with length less than 20
    """
    words = ('i', 'have', 'a', 'cat', 'his', 'name', 'is', 'bruno',
             'i', 'have', 'a', 'dog', 'too', 'his', 'name', 'is', 'rex',
             'her', 'name', 'is', 'rex', 'too', 'he', 'funny', '<END>')
    word_storage = WordStorage()
    word_storage.update(words)
    encoded_corpus = encode_text(word_storage, words)
    bi_gram_trie = NGramTrie(2, encoded_corpus)
    start_context = (word_storage.get_id('cat'), )
    text_generator = NGramTextGenerator(word_storage, bi_gram_trie)
    sentence_length = len(text_generator._generate_sentence(start_context))
    # cause we generate not more than 20 words + end
    upper_bound = len(start_context) + 21
    self.assertLessEqual(sentence_length, upper_bound)