def test_ngram_text_generator_bad_num_input(self):  # new test
    """
    _generate_sentence must raise ValueError for zero and negative integers
    """
    invalid_contexts = (-5, 0, -2, -1, -8)
    text = ('i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>')

    storage = WordStorage()
    storage.update(text)
    bigram_trie = NGramTrie(2, encode_text(storage, text))
    generator = NGramTextGenerator(storage, bigram_trie)

    for invalid_context in invalid_contexts:
        with self.assertRaises(ValueError):
            generator._generate_sentence(invalid_context)
def test_ngram_text_generator_throws_errors(self):
    """
    _generate_sentence must raise ValueError for each malformed context
    """
    invalid_contexts = [[], {}, None, 9, 9.34, True]
    text = ('i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>')

    storage = WordStorage()
    storage.update(text)
    bigram_trie = NGramTrie(2, encode_text(storage, text))
    generator = NGramTextGenerator(storage, bigram_trie)

    for invalid_context in invalid_contexts:
        with self.assertRaises(ValueError):
            generator._generate_sentence(invalid_context)
def test_get_most_frequent_gram_bad_inputs(self):
    """
    get_most_frequent_gram must answer with an empty tuple for each malformed context
    """
    text = ('i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>')

    storage = WordStorage()
    storage.update(text)
    trie = NGramTrie(3, encode_text(storage, text))
    empty_gram = ()
    generator = NGramTextGenerator(storage, trie)

    for invalid_context in [[], {}, (), None, 9, 9.34, True]:
        found_gram = generator.get_most_frequent_gram(invalid_context)
        self.assertEqual(empty_gram, found_gram)
def test_ngram_text_generator_generate_next_word(self):
    """
    For the context ('i', 'have') the generated next word must be 'a'
    """
    text = ('i', 'have', 'a', 'cat', '<END>',
            'i', 'have', 'a', 'bruno', 'cat', '<END>')

    storage = WordStorage()
    storage.update(text)
    trie = NGramTrie(3, encode_text(storage, text))
    generator = NGramTextGenerator(storage, trie)

    first_id = storage.get_id('i')
    second_id = storage.get_id('have')
    wanted_id = storage.get_id('a')

    produced_id = generator._generate_next_word((first_id, second_id))

    self.assertEqual(wanted_id, produced_id)
def test_ngram_text_generator_generate_next_word_incorrect_context(self):
    """
    _generate_next_word must raise ValueError for each malformed context
    """
    text = ('i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>')

    storage = WordStorage()
    storage.update(text)
    trie = NGramTrie(3, encode_text(storage, text))
    generator = NGramTextGenerator(storage, trie)

    # (3, ) - it is incorrect sized ngram
    invalid_contexts = [[], {}, (3, ), None, 9, 9.34, True]

    for invalid_context in invalid_contexts:
        with self.assertRaises(ValueError):
            generator._generate_next_word(invalid_context)
def test_ngram_text_generator_generate_sentence_properly(self):
    """
    A sentence generated from the context ('i', ) must finish with '<END>'
    """
    text = ('i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>')

    storage = WordStorage()
    storage.update(text)
    bigram_trie = NGramTrie(2, encode_text(storage, text))

    start_context = (storage.get_id('i'), )
    end_id = storage.get_id('<END>')

    generator = NGramTextGenerator(storage, bigram_trie)
    sentence = generator._generate_sentence(start_context)

    self.assertEqual(sentence[-1], end_id)
def test_get_most_frequent_gram_ideal(self):
    """
    For the context ('i', 'have') the most frequent gram must be ('i', 'have', 'a')
    """
    text = ('i', 'have', 'a', 'cat', '<END>',
            'i', 'have', 'a', 'bruno', 'cat', '<END>')

    storage = WordStorage()
    storage.update(text)
    trie = NGramTrie(3, encode_text(storage, text))
    generator = NGramTextGenerator(storage, trie)

    context = (storage.get_id('i'), storage.get_id('have'))
    wanted_gram = (
        storage.get_id('i'),
        storage.get_id('have'),
        storage.get_id('a'),
    )

    found_gram = generator.get_most_frequent_gram(context)

    self.assertEqual(wanted_gram, found_gram)
def test_decode_text_upper_first_letter(self):
    """
    Checks that all the letters except the first one
    in a decoded sentence are in lower case
    """
    corpus = ('first', 'sentence', 'here', '<END>',
              'second', 'sentence', 'here', '<END>',
              'third', 'sentence', 'here', '<END>')

    storage = WordStorage()
    storage.update(corpus)

    encoded_text = encode_text(storage, corpus)
    trie = NGramTrie(3, encoded_text)
    context = (storage.get_id('first'), storage.get_id('sentence'))

    likelihood_generator = LikelihoodBasedTextGenerator(storage, trie)
    generated_encoded_text = likelihood_generator.generate_text(context, 1)
    decoded_text = decode_text(storage, generated_encoded_text)

    # assertFalse(...isupper()) only proved that the tail of the sentence is
    # not written entirely in upper case, so a single stray capital letter
    # would go unnoticed. islower() demands that every cased character after
    # the first one is lower case, which is what this test documents.
    self.assertTrue(decoded_text[0][1:].islower())
def test_ngram_text_generator_generate_sentence_no_end(self):
    """
    The last word of the generated sentence must decode to '<END>'
    even though the corpus holds '<END>' only at its very end
    """
    text = ('i', 'have', 'a', 'cat', 'his', 'name', 'is', 'bruno',
            'i', 'have', 'a', 'dog', 'too', 'his', 'name', 'is', 'rex',
            'her', 'name', 'is', 'rex', 'too', '<END>')

    storage = WordStorage()
    storage.update(text)
    bigram_trie = NGramTrie(2, encode_text(storage, text))

    start_context = (storage.get_id('cat'), )
    generator = NGramTextGenerator(storage, bigram_trie)

    sentence = generator._generate_sentence(start_context)
    wanted_word = '<END>'
    last_word = storage.get_word(sentence[-1])

    self.assertEqual(wanted_word, last_word)
def test_get_most_frequent_gram_no_such_context(self):
    """
    get_most_frequent_gram must answer with an empty tuple
    when the context is absent from the corpus
    """
    text = ('i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>')

    storage = WordStorage()
    storage.update(text)
    trie = NGramTrie(3, encode_text(storage, text))
    generator = NGramTextGenerator(storage, trie)

    # there is no such context in ngrams
    unseen_context = (storage.get_id('i'), storage.get_id('name'))

    empty_gram = ()
    found_gram = generator.get_most_frequent_gram(unseen_context)

    self.assertEqual(empty_gram, found_gram)
def test_load_model_has_generator_methods(self):
    """
    Checks that a model restored with load_model exposes the same
    attribute names (as listed by dir) as the generator that was saved
    """
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'dog', 'too', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'rex', 'too', '<END>')

    storage = WordStorage()
    storage.update(corpus)

    encoded = encode_text(storage, corpus)
    trie = NGramTrie(2, encoded)
    generator = NGramTextGenerator(storage, trie)

    save_model(generator, 'my_awesome_model')
    loaded_model = load_model('my_awesome_model')

    # assertEquals is a deprecated alias that no longer exists in
    # Python 3.12+, so the supported assertEqual is used instead.
    self.assertEqual(dir(loaded_model), dir(generator))
def test_save_model_ideal(self):
    """
    After save_model the file 'my_awesome_model' must be readable and non-empty
    """
    text = ('i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
            'i', 'have', 'a', 'dog', 'too', '<END>',
            'his', 'name', 'is', 'rex', '<END>',
            'her', 'name', 'is', 'rex', 'too', '<END>')

    word_storage = WordStorage()
    word_storage.update(text)
    bigram_trie = NGramTrie(2, encode_text(word_storage, text))
    model = NGramTextGenerator(word_storage, bigram_trie)

    save_model(model, 'my_awesome_model')

    with open('my_awesome_model', 'r', encoding='utf-8') as saved_file:
        saved_content = saved_file.read()

    self.assertTrue(saved_content)
def test_generate_next_word_incorrect_context(self):
    """
    LikelihoodBasedTextGenerator._generate_next_word must raise ValueError
    for each malformed context
    """
    text = ('i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
            'i', 'have', 'a', 'dog', 'too', '<END>',
            'his', 'name', 'is', 'rex', '<END>',
            'her', 'name', 'is', 'rex', 'too', '<END>')

    word_storage = WordStorage()
    word_storage.update(text)
    trie = NGramTrie(3, encode_text(word_storage, text))

    invalid_contexts = [[], {}, (2000, ), None, 9, 9.34, True]
    generator = LikelihoodBasedTextGenerator(word_storage, trie)

    for invalid_context in invalid_contexts:
        with self.assertRaises(ValueError):
            generator._generate_next_word(invalid_context)
def test_text_generator_generate_sentence_proper_number_of_end(self):
    """
    A generated sentence must contain the '<END>' id exactly once
    """
    text = ('i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
            'i', 'have', 'a', 'dog', 'too', '<END>',
            'his', 'name', 'is', 'rex', '<END>',
            'there', 'are', 'a', 'cat', 'outside', '<END>',
            'here', 'is', 'a', 'cat', 'outside', '<END>')

    word_storage = WordStorage()
    word_storage.update(text)
    trie = NGramTrie(3, encode_text(word_storage, text))

    # NOTE(review): the context holds three ids although the trie is built
    # with NGramTrie(3, ...) - confirm that this length is intended.
    start_context = (
        word_storage.get_id('a'),
        word_storage.get_id('is'),
        word_storage.get_id('<END>'),
    )

    generator = NGramTextGenerator(word_storage, trie)
    sentence = generator._generate_sentence(start_context)

    end_occurrences = sentence.count(word_storage.get_id('<END>'))
    self.assertEqual(1, end_occurrences)
def test_float_result(self):
    """
    The value returned by _calculate_maximum_likelihood must have the type float
    """
    text = ('i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
            'i', 'have', 'a', 'dog', 'too', '<END>',
            'his', 'name', 'is', 'rex', '<END>',
            'her', 'name', 'is', 'rex', 'too', '<END>')

    word_storage = WordStorage()
    word_storage.update(text)
    bigram_trie = NGramTrie(2, encode_text(word_storage, text))

    given_context = (word_storage.get_id('i'), )
    candidate = word_storage.get_id('have')

    generator = LikelihoodBasedTextGenerator(word_storage, bigram_trie)
    likelihood = generator._calculate_maximum_likelihood(candidate, given_context)

    self.assertEqual(float, type(likelihood))
def test_ngram_text_generator_generate_next_word_no_such_context(self):
    """
    For a context that is absent from the n-grams
    the generated next word must be '<END>'
    """
    text = ('i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>')

    storage = WordStorage()
    storage.update(text)
    trie = NGramTrie(3, encode_text(storage, text))
    generator = NGramTextGenerator(storage, trie)

    # there is no such context in ngrams, so return most frequent option
    unseen_context = (storage.get_id('i'), storage.get_id('name'))

    most_frequent_id = storage.get_id('<END>')  # as it appears twice
    produced_id = generator._generate_next_word(unseen_context)

    self.assertEqual(most_frequent_id, produced_id)
def test_ngram_text_generator_generate_sentence_ideal(self):
    """
    Starting from ('i', ) the second element of the sentence must be 'have'
    and the last element must be '<END>'
    """
    text = ('i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
            'i', 'have', 'a', 'dog', 'too', '<END>',
            'his', 'name', 'is', 'rex', '<END>',
            'her', 'name', 'is', 'rex', 'too', '<END>')

    word_storage = WordStorage()
    word_storage.update(text)
    bigram_trie = NGramTrie(2, encode_text(word_storage, text))

    start_context = (word_storage.get_id('i'), )
    have_id = word_storage.get_id('have')
    end_id = word_storage.get_id('<END>')

    generator = NGramTextGenerator(word_storage, bigram_trie)
    sentence = generator._generate_sentence(start_context)

    self.assertEqual(sentence[1], have_id)
    self.assertEqual(sentence[-1], end_id)
def test_generate_text_ideal(self):
    """
    generate_text(context, 3) must return a text with exactly three '<END>' ids
    """
    text = ('i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
            'i', 'have', 'a', 'dog', 'too', '<END>',
            'his', 'name', 'is', 'rex', '<END>',
            'her', 'name', 'is', 'rex', 'too', '<END>')

    word_storage = WordStorage()
    word_storage.update(text)
    bigram_trie = NGramTrie(2, encode_text(word_storage, text))
    generator = NGramTextGenerator(word_storage, bigram_trie)

    start_context = (word_storage.get_id('bruno'), )
    end_id = word_storage.get_id('<END>')

    generated = generator.generate_text(start_context, 3)

    self.assertEqual(generated.count(end_id), 3)
def test_ngram_text_generator_end_at_the_beginning(self):
    """
    With '<END>' as the context the generated sentence must end with '<END>'
    and must contain it exactly once
    """
    text = ('i', 'like', 'to', 'read', '<END>',
            'he', 'likes', 'to', 'read', 'too',
            'i', 'like', 'a', 'book', 'called', '"Harry Potter"', '<END>',
            'he', 'likes', 'another', 'book', '<END>',
            'he', 'does', 'not', 'tell', 'me', 'name', '<END>')

    word_storage = WordStorage()
    word_storage.update(text)
    bigram_trie = NGramTrie(2, encode_text(word_storage, text))

    start_context = (word_storage.get_id('<END>'), )
    end_id = word_storage.get_id('<END>')

    generator = NGramTextGenerator(word_storage, bigram_trie)
    sentence = generator._generate_sentence(start_context)

    self.assertEqual(end_id, sentence[-1])
    end_occurrences = sentence.count(word_storage.get_id('<END>'))
    self.assertEqual(1, end_occurrences)
def test_save_model_incorrect(self):
    """
    save_model must raise ValueError for each malformed second argument
    """
    text = ('i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
            'i', 'have', 'a', 'dog', 'too', '<END>',
            'his', 'name', 'is', 'rex', '<END>',
            'her', 'name', 'is', 'rex', 'too', '<END>')

    word_storage = WordStorage()
    word_storage.update(text)
    bigram_trie = NGramTrie(2, encode_text(word_storage, text))
    model = NGramTextGenerator(word_storage, bigram_trie)

    invalid_arguments = ((), [], {}, 123, None, WordStorage)

    for invalid_argument in invalid_arguments:
        with self.assertRaises(ValueError):
            save_model(model, invalid_argument)
def test_ngram_trie_unigrams(self):
    """
    The uni_grams attribute of NGramTrie must map every one-element tuple
    to the number of times its id occurs in the encoded corpus
    """
    encoded_corpus = (1, 2, 3, 4, 5, 6, 7, 8, 9, 5,
                      1, 2, 3, 10, 11, 5,
                      6, 7, 8, 12, 5,
                      13, 7, 8, 12, 11, 5)

    trie = NGramTrie(2, encoded_corpus)

    # id -> expected number of occurrences, wrapped into 1-tuples below
    occurrences = {
        1: 2, 2: 2, 3: 2, 4: 1, 5: 5, 6: 2, 7: 3,
        8: 3, 9: 1, 10: 1, 11: 2, 12: 2, 13: 1,
    }
    wanted_unigrams = {(word_id, ): count for word_id, count in occurrences.items()}

    self.assertEqual(wanted_unigrams, trie.uni_grams)
def test_generate_next_word_no_context(self):
    """
    For the context ('cat', 'is') the likelihood based generator
    must produce the id of 'he'
    """
    text = ('he', 'likes', 'a', 'cat', 'but', 'he', 'does', 'not',
            'like', 'parrots', '<END>',
            'he', 'says', 'that', 'his', 'name', 'is', 'bruno', '<END>')

    word_storage = WordStorage()
    word_storage.update(text)
    trie = NGramTrie(3, encode_text(word_storage, text))

    given_context = (word_storage.get_id('cat'), word_storage.get_id('is'))

    generator = LikelihoodBasedTextGenerator(word_storage, trie)

    wanted_id = word_storage.get_id('he')
    produced_id = generator._generate_next_word(given_context)

    self.assertEqual(wanted_id, produced_id)
def test_calculate_likelihood_incorrect_context(self):
    """
    _calculate_maximum_likelihood must raise ValueError
    for each malformed context
    """
    text = ('i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
            'i', 'have', 'a', 'dog', 'too', '<END>',
            'his', 'name', 'is', 'rex', '<END>',
            'her', 'name', 'is', 'rex', 'too', '<END>')

    word_storage = WordStorage()
    word_storage.update(text)
    bigram_trie = NGramTrie(2, encode_text(word_storage, text))

    # (2000, 1000, ) -> context for three gram
    invalid_contexts = [[], {}, (2000, 1000, ), None, 9, 9.34, True]

    candidate = word_storage.get_id('dog')
    generator = LikelihoodBasedTextGenerator(word_storage, bigram_trie)

    for invalid_context in invalid_contexts:
        with self.assertRaises(ValueError):
            generator._calculate_maximum_likelihood(candidate, invalid_context)
def test_calculate_likelihood_incorrect_word(self):
    """
    _calculate_maximum_likelihood must raise ValueError
    for each malformed word
    """
    text = ('i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
            'i', 'have', 'a', 'dog', 'too', '<END>',
            'his', 'name', 'is', 'rex', '<END>',
            'her', 'name', 'is', 'rex', 'too', '<END>')

    word_storage = WordStorage()
    word_storage.update(text)
    bigram_trie = NGramTrie(2, encode_text(word_storage, text))

    invalid_words = [(), [], None, 123]

    # NOTE(review): the context holds two ids while the trie is built with
    # NGramTrie(2, ...) - confirm that the ValueError is caused by the word
    # and not by the size of the context.
    given_context = (word_storage.get_id('have'), word_storage.get_id('a'))

    generator = LikelihoodBasedTextGenerator(word_storage, bigram_trie)

    for invalid_word in invalid_words:
        with self.assertRaises(ValueError):
            generator._calculate_maximum_likelihood(invalid_word, given_context)
def test_calculate_likelihood_ideal(self):
    """
    The likelihood of 'dog' after the context ('have', 'a') must equal 1 / 2
    """
    text = ('i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
            'i', 'have', 'a', 'dog', 'too', '<END>',
            'his', 'name', 'is', 'rex', '<END>',
            'her', 'name', 'is', 'rex', 'too', '<END>')

    word_storage = WordStorage()
    word_storage.update(text)
    trie = NGramTrie(3, encode_text(word_storage, text))

    candidate = word_storage.get_id('dog')
    given_context = (word_storage.get_id('have'), word_storage.get_id('a'))

    generator = LikelihoodBasedTextGenerator(word_storage, trie)

    wanted_likelihood = 1 / 2
    likelihood = generator._calculate_maximum_likelihood(candidate, given_context)

    self.assertEqual(wanted_likelihood, likelihood)
def test_generate_next_word_same_prob(self):
    """
    For the context ('have', 'a') the likelihood based generator
    must produce the id of 'cat'
    """
    text = ('i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
            'i', 'have', 'a', 'dog', 'too', '<END>',
            'his', 'name', 'is', 'rex', '<END>',
            'her', 'name', 'is', 'rex', 'too', '<END>')

    word_storage = WordStorage()
    word_storage.update(text)
    trie = NGramTrie(3, encode_text(word_storage, text))

    wanted_id = word_storage.get_id('cat')
    given_context = (word_storage.get_id('have'), word_storage.get_id('a'))

    generator = LikelihoodBasedTextGenerator(word_storage, trie)
    produced_id = generator._generate_next_word(given_context)

    self.assertEqual(wanted_id, produced_id)
def test_text_generator_generate_sentence_includes_context(self):
    """
    The generated sentence must begin with the given context
    (the context used here does not contain '<END>')
    """
    text = ('i', 'have', 'a', 'cat', 'and', 'a', 'dog', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
            'i', 'have', 'a', 'dog', 'too', '<END>',
            'his', 'name', 'is', 'rex', '<END>',
            'there', 'are', 'a', 'cat', 'and', 'a', 'bear', 'outside', '<END>',
            'here', 'is', 'a', 'cat', 'outside', '<END>')

    word_storage = WordStorage()
    word_storage.update(text)
    trie = NGramTrie(3, encode_text(word_storage, text))
    generator = NGramTextGenerator(word_storage, trie)

    start_context = (word_storage.get_id('a'), word_storage.get_id('cat'))
    sentence = generator._generate_sentence(start_context)

    sentence_head = sentence[:len(start_context)]
    self.assertEqual(start_context, sentence_head)
""" Lab 4 implementation starter """ from lab_4.main import BackOffGenerator, encode_text, WordStorage, decode_text, tokenize_by_sentence from lab_4.ngrams.ngram_trie import NGramTrie if __name__ == '__main__': with open('lab_3/Frank_Baum.txt', 'r', encoding='utf-8') as file_frank: corpus = tokenize_by_sentence(file_frank.read()) storage = WordStorage() storage.update(corpus) encoded = encode_text(storage, corpus) trie = NGramTrie(3, encoded) four = NGramTrie(4, encoded) context = ( storage.get_id('when'), storage.get_id('the'), ) generator = BackOffGenerator(storage, four, trie) generated_text = generator.generate_text(context, 5) RESULT = decode_text(storage, generated_text) # DO NOT REMOVE NEXT LINE - KEEP IT INTENTIONALLY LAST assert RESULT, 'Encoding not working'
if __name__ == '__main__':
    # Sample text; the adjacent literals join into one single-line string.
    TEXT = ("I have a cat. His name is Max. "
            "I have a dog. His name is Jake. "
            "I do not have a dog or a cat. "
            "I have a parrot. Her name is Max too. "
            "And I have a parrot too. But his name is Leo.")

    # Tokenize the text, fill the storage and encode the tokens as ids.
    corpus = lab_4.main.tokenize_by_sentence(TEXT)
    storage = lab_4.main.WordStorage()
    storage.update(corpus)
    encoded_text = lab_4.main.encode_text(storage, corpus)

    # Tries of sizes 2, 3 and 4 over the same encoded text.
    ngram_trie_2 = NGramTrie(2, encoded_text)
    ngram_trie_3 = NGramTrie(3, encoded_text)
    ngram_trie_4 = NGramTrie(4, encoded_text)

    expected_word = storage.get_id('a')
    CONTEXT = (storage.get_id('i'), storage.get_id('have'))

    # One generator of each kind; the back-off generator receives all three tries.
    generator_ngram_trie = lab_4.main.NGramTextGenerator(storage, ngram_trie_3)
    generator_likelihood = lab_4.main.LikelihoodBasedTextGenerator(storage, ngram_trie_3)
    generator_backoff = lab_4.main.BackOffGenerator(
        storage, ngram_trie_3, ngram_trie_2, ngram_trie_4)

    # Generate three sentences with every generator and decode them to text.
    encoded_output_1 = generator_ngram_trie.generate_text(CONTEXT, 3)
    generated_text_1 = lab_4.main.decode_text(storage, encoded_output_1)

    encoded_output_2 = generator_likelihood.generate_text(CONTEXT, 3)
    generated_text_2 = lab_4.main.decode_text(storage, encoded_output_2)

    encoded_output_3 = generator_backoff.generate_text(CONTEXT, 3)
    generated_text_3 = lab_4.main.decode_text(storage, encoded_output_3)
from lab_4.main import tokenize_by_sentence, WordStorage, encode_text, NGramTextGenerator
from lab_4.ngrams.ngram_trie import NGramTrie

if __name__ == '__main__':
    # Four newline-separated sentences used as the corpus source.
    TEXT = ('I like flowers.\n'
            'My mom likes flowers too.\n'
            'Her favourite flower is rose.\n'
            'My favourite flower is rose too.')

    corpus = tokenize_by_sentence(TEXT)

    word_storage = WordStorage()
    word_storage.update(corpus)
    encoded_corpus = encode_text(word_storage, corpus)

    ngrams = NGramTrie(2, encoded_corpus)
    text_generator = NGramTextGenerator(word_storage, ngrams)

    # Ask for two sentences starting from the one-element context (1, ).
    gen_text = text_generator.generate_text((1, ), 2)

    # The number of '<END>' ids in the output is compared with RESULT.
    end = word_storage.get_id('<END>')
    actual = gen_text.count(end)

    RESULT = 2
    print(actual)
    assert RESULT == actual, 'not working'