def test_decode_text_ideal_conditions(self):
    """Decoded generated text consists of clean, well-formed sentences."""
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'dog', 'too', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'rex', 'too', '<END>')
    word_storage = WordStorage()
    word_storage.update(corpus)
    encoded_corpus = encode_text(word_storage, corpus)
    tri_gram_trie = NGramTrie(3, encoded_corpus)
    context = (word_storage.get_id('name'), word_storage.get_id('is'))
    text_generator = LikelihoodBasedTextGenerator(word_storage, tri_gram_trie)
    generated = text_generator.generate_text(context, 2)
    decoded = decode_text(word_storage, generated)
    for sentence in decoded:
        # no service tokens, capitalised start, alphabetic last character
        self.assertTrue('<END>' not in sentence)
        self.assertTrue(sentence[0].isupper())
        self.assertTrue(sentence[-1].isalpha())
def test_end(self):
    """After decoding, no '<END>' marker remains in the result."""
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'cat', '<END>')
    word_storage = WordStorage()
    word_storage.update(corpus)
    encoded_corpus = encode_text(word_storage, corpus)
    tri_gram_trie = NGramTrie(3, encoded_corpus)
    context = (word_storage.get_id('a'), word_storage.get_id('cat'))
    text_generator = LikelihoodBasedTextGenerator(word_storage, tri_gram_trie)
    generated = text_generator.generate_text(context, 1)
    decoded = decode_text(word_storage, generated)
    self.assertEqual(('A cat',), decoded)
def test_decode_text_ideal(self):
    """
    Generated text ends with the '<END>' id and decodes to the expected
    sentences.
    """
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'dog', 'too', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'rex', 'too', '<END>')
    storage = WordStorage()
    storage.update(corpus)
    encoded = encode_text(storage, corpus)
    trie = NGramTrie(3, encoded)
    context = (
        storage.get_id('name'),
        storage.get_id('is'),
    )
    end = storage.get_id('<END>')
    generator = LikelihoodBasedTextGenerator(storage, trie)
    to_decode = generator.generate_text(context, 2)
    # the raw generated sequence must terminate with the sentence-end id
    self.assertEqual(to_decode[-1], end)
    expected = ('Name is rex', 'Her name is rex')
    actual = decode_text(storage, to_decode)
    self.assertEqual(expected, actual)
def realize_n_gram_text_generator(text):
    """Build a tri-gram text generator over *text* and decode 3 generated sentences."""
    storage = WordStorage()
    storage.update(text)
    context = (storage.get_id('my'), storage.get_id('dear'))
    encoded = encode_text(storage, text)
    trie = NGramTrie(3, encoded)
    generator = NGramTextGenerator(storage, trie)
    generated = generator.generate_text(context, 3)
    return decode_text(storage, generated)
def realize_likelihood_generator(text):
    """Generate 3 sentences with a likelihood-based model loaded from disk."""
    storage = WordStorage()
    storage.update(text)
    context = (storage.get_id('i'), storage.get_id('shall'))
    model = load_model('lab_4/likelihood_model.json')
    # NOTE(review): context ids and the final decode use the local storage,
    # while generation uses model.word_storage — confirm both share one id space
    generator = LikelihoodBasedTextGenerator(model.word_storage, model.n_gram_trie)
    generated = generator.generate_text(context, 3)
    return decode_text(storage, generated)
def test_decode_text_empty_sentence(self):
    """
    decode_text handles an empty corpus and returns an empty tuple
    """
    storage = WordStorage()
    empty_corpus = ()
    storage.update(empty_corpus)
    decoded = decode_text(storage, empty_corpus)
    self.assertEqual((), decoded)
def realize_backoff_generator(text):
    """Generate 3 sentences with a back-off generator built over *text*."""
    storage = WordStorage()
    storage.update(text)
    encoded = encode_text(storage, text)
    bigram_trie = NGramTrie(2, encoded)
    trigram_trie = NGramTrie(3, encoded)
    context = (storage.get_id('if'), storage.get_id('you'))
    generator = BackOffGenerator(storage, trigram_trie, bigram_trie)
    generated = generator.generate_text(context, 3)
    return decode_text(storage, generated)
def test_decode_text_upper_first_letter(self):
    """
    Only the first letter of a decoded sentence may be in upper case
    """
    corpus = ('first', 'sentence', 'here', '<END>',
              'second', 'sentence', 'here', '<END>',
              'third', 'sentence', 'here', '<END>')
    storage = WordStorage()
    storage.update(corpus)
    trie = NGramTrie(3, encode_text(storage, corpus))
    context = (storage.get_id('first'), storage.get_id('sentence'))
    generator = LikelihoodBasedTextGenerator(storage, trie)
    decoded = decode_text(storage, generator.generate_text(context, 1))
    # NOTE(review): `.islower()` on the tail would be a stricter check — confirm
    self.assertFalse(decoded[0][1:].isupper())
""" Lab 4 implementation starter """ from lab_4.main import BackOffGenerator, encode_text, WordStorage, decode_text, tokenize_by_sentence from lab_4.ngrams.ngram_trie import NGramTrie if __name__ == '__main__': with open('lab_3/Frank_Baum.txt', 'r', encoding='utf-8') as file_frank: corpus = tokenize_by_sentence(file_frank.read()) storage = WordStorage() storage.update(corpus) encoded = encode_text(storage, corpus) trie = NGramTrie(3, encoded) four = NGramTrie(4, encoded) context = ( storage.get_id('when'), storage.get_id('the'), ) generator = BackOffGenerator(storage, four, trie) generated_text = generator.generate_text(context, 5) RESULT = decode_text(storage, generated_text) # DO NOT REMOVE NEXT LINE - KEEP IT INTENTIONALLY LAST assert RESULT, 'Encoding not working'
"""
Demonstration of the n-gram and likelihood-based text generators.
"""
# Fix: WordStorage, encode_text, decode_text, NGramTextGenerator and the
# trie class were used below but never imported.
from lab_4.main import (WordStorage, encode_text, decode_text,
                        NGramTextGenerator, LikelihoodBasedTextGenerator,
                        BackOffGenerator)
from lab_4.ngrams.ngram_trie import NGramTrie

if __name__ == '__main__':
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'dog', 'too', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'rex', 'too', '<END>')
    storage = WordStorage()
    storage.update(corpus)
    encoded = encode_text(storage, corpus)
    trie = NGramTrie(2, encoded)
    context = (storage.get_id('i'),)

    generator = NGramTextGenerator(storage, trie)
    actual = decode_text(storage, generator.generate_text(context, 5))
    print(actual)

    generator = LikelihoodBasedTextGenerator(storage, trie)
    actual = decode_text(storage, generator.generate_text(context, 5))
    print(actual)

    # tries and context prepared for the back-off demonstration
    two = NGramTrie(2, encoded)
    trie = NGramTrie(3, encoded)
    context = (storage.get_id('name'), storage.get_id('is'))
""" from lab_4.main import LikelihoodBasedTextGenerator, encode_text, WordStorage, decode_text from lab_4.ngrams.ngram_trie import NGramTrie if __name__ == '__main__': corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is', 'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>', 'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is', 'rex', 'too', '<END>') storage = WordStorage() storage.update(corpus) encoded = encode_text(storage, corpus) trie = NGramTrie(3, encoded) context = ( storage.get_id('name'), storage.get_id('is'), ) end = storage.get_id('<END>') generator = LikelihoodBasedTextGenerator(storage, trie) to_decode = generator.generate_text(context, 2) EXPECTED = ('Name is rex', 'Her name is rex') RESULT = decode_text(storage, to_decode) assert RESULT == EXPECTED, 'Encoding not working'
"""
Lab 4 starter: tokenize a sample text and generate sentences from it.
"""
from lab_4.main import tokenize_by_sentence
from lab_4.main import WordStorage, LikelihoodBasedTextGenerator
from lab_4.main import encode_text, decode_text
from lab_4.main import NGramTextGenerator
# Fix: NGramTrie was used below but never imported.
from lab_4.ngrams.ngram_trie import NGramTrie

if __name__ == '__main__':
    TEXT = 'I have a cat. His name is Bruno. I have a dog too. His name is Rex. Her name is Rex too.'
    tokenized_text = tokenize_by_sentence(TEXT)
    word_storage = WordStorage()
    word_storage.update(tokenized_text)
    encoded = encode_text(word_storage, tokenized_text)
    trie = NGramTrie(3, encoded)
    context = (
        word_storage.get_id('name'),
        word_storage.get_id('is'),
    )
    generator = NGramTextGenerator(word_storage, trie)
    generated_text = generator.generate_text(context, 2)  # n-gram demo; result unused
    gen_likelihood = LikelihoodBasedTextGenerator(word_storage, trie)
    gen_text = gen_likelihood.generate_text(context, 2)
    decoded_text = decode_text(word_storage, gen_text)
    RESULT = decoded_text
    assert RESULT == ('Name is rex', 'Her name is rex'), "Not working"
'too', '<END>') storage = WordStorage() storage.update(corpus) encoded = encode_text(storage, corpus) trie = NGramTrie(4, encoded) context = (storage.get_id('i'), storage.get_id('have'), storage.get_id('a')) generator_likelihood = LikelihoodBasedTextGenerator(storage, trie) generated_text = generator_likelihood.generate_text(context, 3) decoded_gen_text = decode_text(storage, generated_text) print('Likelihood generator generates sentences:') print(*decoded_gen_text, sep='. ', end='.\n') two = NGramTrie(2, encoded) trie = NGramTrie(3, encoded) context = ( storage.get_id('i'), storage.get_id('have'), ) generator_backoff = BackOffGenerator(storage, trie, two) actual = generator_backoff.generate_text(context, 3) RESULT = decode_text(storage, actual)
""" Lab 4 starter """ from lab_4.main import BackOffGenerator, encode_text, WordStorage, decode_text from lab_4.ngrams.ngram_trie import NGramTrie if __name__ == '__main__': corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is', 'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>', 'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is', 'rex', 'too', '<END>') storage = WordStorage() storage.update(corpus) encoded_text = encode_text(storage, corpus) trie = NGramTrie(3, encoded_text) four = NGramTrie(4, encoded_text) context = ( storage.get_id('his'), storage.get_id('name'), storage.get_id('is'), ) generator = BackOffGenerator(storage, trie, four) text = generator.generate_text(context, 3) actual = decode_text(storage, text) RESULT = ('His name is bruno', 'I have a cat', 'His name is bruno') assert RESULT == actual, 'Not work'
cannot have egg bacon spam and sausage without the spam. I do not like spam! Sshh, dear, do not cause a fuss. I will have your spam. I love it. I am having spam beaked beans spam and spam! Lovely spam! Wonderful spam! Shut up! Baked beans are off. Well could I have her spam instead of the baked beans then?''' corpus = tokenize_by_sentence(TEXT) storage = WordStorage() storage.update(corpus) encoded_text = encode_text(storage, corpus) n_gram_trie = NGramTrie(3, encoded_text) generator = LikelihoodBasedTextGenerator(storage, n_gram_trie) context = (storage.get_id('bloody'), storage.get_id('vikings')) generated_text = generator.generate_text(context, 5) decoded_text = decode_text(storage, generated_text) IS_WORKING = True for sentence in decoded_text: if '<END>' in sentence or not sentence[0].isupper( ) or not sentence[-1].isalpha(): IS_WORKING = False print(decoded_text) RESULT = IS_WORKING assert RESULT, 'Not working'