def test_ngram_text_generator_instance_creation(self):
    """
    Verify that the constructor keeps the storage and the trie it receives.
    """
    storage = WordStorage()
    empty_trie = NGramTrie(2, ())

    text_generator = NGramTextGenerator(storage, empty_trie)

    self.assertEqual(text_generator._word_storage, storage)
    self.assertEqual(text_generator._n_gram_trie, empty_trie)
def test_ngram_text_generator_generate_sentence_no_end(self):
    """
    The generated sentence must still finish with '<END>' when the corpus
    contains that token only once, as its final element.
    """
    tokens = (
        'i', 'have', 'a', 'cat',
        'his', 'name', 'is', 'bruno',
        'i', 'have', 'a', 'dog', 'too',
        'his', 'name', 'is', 'rex',
        'her', 'name', 'is', 'rex', 'too', '<END>',
    )
    storage = WordStorage()
    storage.update(tokens)
    bigram_trie = NGramTrie(2, encode_text(storage, tokens))
    start_context = (storage.get_id('cat'),)
    text_generator = NGramTextGenerator(storage, bigram_trie)

    sentence = text_generator._generate_sentence(start_context)
    last_word = storage.get_word(sentence[-1])

    self.assertEqual('<END>', last_word)
def test_get_most_frequent_gram_no_such_context(self):
    """
    An empty tuple is expected when the context never occurs in the corpus.
    """
    tokens = (
        'i', 'have', 'a', 'cat', '<END>',
        'his', 'name', 'is', 'bruno', '<END>',
    )
    storage = WordStorage()
    storage.update(tokens)
    trigram_trie = NGramTrie(3, encode_text(storage, tokens))
    text_generator = NGramTextGenerator(storage, trigram_trie)

    # The pair ('i', 'name') never appears consecutively in the corpus.
    unseen_context = (storage.get_id('i'), storage.get_id('name'))

    found = text_generator.get_most_frequent_gram(unseen_context)
    self.assertEqual((), found)
def test_text_generator_generate_sentence_proper_number_of_end(self):
    """
    The generated sentence must contain exactly one '<END>' token.
    """
    tokens = (
        'i', 'have', 'a', 'cat', '<END>',
        'his', 'name', 'is', 'bruno', '<END>',
        'i', 'have', 'a', 'dog', 'too', '<END>',
        'his', 'name', 'is', 'rex', '<END>',
        'there', 'are', 'a', 'cat', 'outside', '<END>',
        'here', 'is', 'a', 'cat', 'outside', '<END>',
    )
    word_storage = WordStorage()
    word_storage.update(tokens)
    trigram_trie = NGramTrie(3, encode_text(word_storage, tokens))
    # The context deliberately ends with the '<END>' id.
    start_context = tuple(
        word_storage.get_id(word) for word in ('a', 'is', '<END>')
    )
    text_generator = NGramTextGenerator(word_storage, trigram_trie)

    sentence = text_generator._generate_sentence(start_context)

    end_occurrences = sentence.count(word_storage.get_id('<END>'))
    self.assertEqual(1, end_occurrences)
def test_ngram_text_generator_generate_next_word_no_such_context(self):
    """
    With an unseen context the next word is expected to be '<END>',
    which occurs twice in this corpus.
    """
    tokens = (
        'i', 'have', 'a', 'cat', '<END>',
        'his', 'name', 'is', 'bruno', '<END>',
    )
    storage = WordStorage()
    storage.update(tokens)
    trigram_trie = NGramTrie(3, encode_text(storage, tokens))
    text_generator = NGramTextGenerator(storage, trigram_trie)

    # The pair ('i', 'name') never appears consecutively in the corpus.
    unseen_context = (storage.get_id('i'), storage.get_id('name'))
    end_id = storage.get_id('<END>')

    next_word = text_generator._generate_next_word(unseen_context)

    self.assertEqual(end_id, next_word)
def test_ngram_text_generator_generate_sentence_ideal(self):
    """
    Check the first generated word and the final word of a sentence.
    """
    tokens = (
        'i', 'have', 'a', 'cat', '<END>',
        'his', 'name', 'is', 'bruno', '<END>',
        'i', 'have', 'a', 'dog', 'too', '<END>',
        'his', 'name', 'is', 'rex', '<END>',
        'her', 'name', 'is', 'rex', 'too', '<END>',
    )
    word_storage = WordStorage()
    word_storage.update(tokens)
    bigram_trie = NGramTrie(2, encode_text(word_storage, tokens))
    start_context = (word_storage.get_id('i'),)
    expected_second = word_storage.get_id('have')
    expected_last = word_storage.get_id('<END>')
    text_generator = NGramTextGenerator(word_storage, bigram_trie)

    sentence = text_generator._generate_sentence(start_context)

    # Index 1 is the first word produced after the one-word context.
    self.assertEqual(sentence[1], expected_second)
    self.assertEqual(sentence[-1], expected_last)
def test_generate_text_ideal(self):
    """
    Requesting three sentences must yield exactly three '<END>' tokens.
    """
    tokens = (
        'i', 'have', 'a', 'cat', '<END>',
        'his', 'name', 'is', 'bruno', '<END>',
        'i', 'have', 'a', 'dog', 'too', '<END>',
        'his', 'name', 'is', 'rex', '<END>',
        'her', 'name', 'is', 'rex', 'too', '<END>',
    )
    word_storage = WordStorage()
    word_storage.update(tokens)
    bigram_trie = NGramTrie(2, encode_text(word_storage, tokens))
    text_generator = NGramTextGenerator(word_storage, bigram_trie)

    start_context = (word_storage.get_id('bruno'),)
    end_id = word_storage.get_id('<END>')

    generated = text_generator.generate_text(start_context, 3)

    self.assertEqual(generated.count(end_id), 3)
def test_ngram_text_generator_end_at_the_beginning(self):
    """
    A sentence started from an '<END>' context must contain '<END>' only
    once, and only as its last element.
    """
    tokens = (
        'i', 'like', 'to', 'read', '<END>',
        'he', 'likes', 'to', 'read', 'too',
        'i', 'like', 'a', 'book', 'called', '"Harry Potter"', '<END>',
        'he', 'likes', 'another', 'book', '<END>',
        'he', 'does', 'not', 'tell', 'me', 'name', '<END>',
    )
    word_storage = WordStorage()
    word_storage.update(tokens)
    bigram_trie = NGramTrie(2, encode_text(word_storage, tokens))
    start_context = (word_storage.get_id('<END>'),)
    expected_last = word_storage.get_id('<END>')
    text_generator = NGramTextGenerator(word_storage, bigram_trie)

    sentence = text_generator._generate_sentence(start_context)

    self.assertEqual(expected_last, sentence[-1])
    end_occurrences = sentence.count(word_storage.get_id('<END>'))
    self.assertEqual(1, end_occurrences)
def test_save_model_incorrect_path(self):
    """
    save_model is expected to raise FileNotFoundError for the path
    'some_folder/some_file'.
    """
    tokens = (
        'i', 'have', 'a', 'cat', '<END>',
        'his', 'name', 'is', 'bruno', '<END>',
    )
    word_storage = WordStorage()
    word_storage.update(tokens)
    bigram_trie = NGramTrie(2, encode_text(word_storage, tokens))
    text_generator = NGramTextGenerator(word_storage, bigram_trie)

    with self.assertRaises(FileNotFoundError):
        save_model(text_generator, r'some_folder/some_file')
def test_context_end(self):
    """
    _generate_sentence is expected to raise ValueError for a context
    that contains '<END>'.
    """
    # NOTE(review): the context holds raw words, not word ids - confirm
    # that this is the intended input for the check.
    bad_context = ('cat', '<END>')
    tokens = (
        'i', 'have', 'a', 'cat', '<END>',
        'his', 'name', 'is', 'bruno', '<END>',
    )
    storage = WordStorage()
    storage.update(tokens)
    trigram_trie = NGramTrie(3, encode_text(storage, tokens))
    text_generator = NGramTextGenerator(storage, trigram_trie)

    with self.assertRaises(ValueError):
        text_generator._generate_sentence(bad_context)
def test_text_generator_generate_sentence_includes_context(self):
    """
    The generated sentence must begin with the supplied context
    (the context here does not contain '<END>').
    """
    tokens = (
        'i', 'have', 'a', 'cat', 'and', 'a', 'dog', '<END>',
        'his', 'name', 'is', 'bruno', '<END>',
        'i', 'have', 'a', 'dog', 'too', '<END>',
        'his', 'name', 'is', 'rex', '<END>',
        'there', 'are', 'a', 'cat', 'and', 'a', 'bear', 'outside', '<END>',
        'here', 'is', 'a', 'cat', 'outside', '<END>',
    )
    word_storage = WordStorage()
    word_storage.update(tokens)
    trigram_trie = NGramTrie(3, encode_text(word_storage, tokens))
    text_generator = NGramTextGenerator(word_storage, trigram_trie)

    start_context = (word_storage.get_id('a'), word_storage.get_id('cat'))

    sentence = text_generator._generate_sentence(start_context)

    sentence_head = sentence[:len(start_context)]
    self.assertEqual(start_context, sentence_head)
def test_text_generator_throws_errors(self):
    """
    generate_text must raise ValueError for every malformed context.
    """
    invalid_contexts = [[], {}, None, 9, 9.34, True]
    tokens = (
        'i', 'have', 'a', 'cat', '<END>',
        'his', 'name', 'is', 'bruno', '<END>',
    )
    storage = WordStorage()
    storage.update(tokens)
    bigram_trie = NGramTrie(2, encode_text(storage, tokens))
    text_generator = NGramTextGenerator(storage, bigram_trie)

    for invalid_context in invalid_contexts:
        with self.assertRaises(ValueError):
            text_generator.generate_text(invalid_context, 10)
def test_ngram_text_generator_bad_num_input(self):
    """
    _generate_sentence must raise ValueError when given a non-positive
    integer in place of a context.
    """
    invalid_numbers = (-5, 0, -2, -1, -8)
    tokens = (
        'i', 'have', 'a', 'cat', '<END>',
        'his', 'name', 'is', 'bruno', '<END>',
    )
    storage = WordStorage()
    storage.update(tokens)
    bigram_trie = NGramTrie(2, encode_text(storage, tokens))
    text_generator = NGramTextGenerator(storage, bigram_trie)

    for invalid_number in invalid_numbers:
        with self.assertRaises(ValueError):
            text_generator._generate_sentence(invalid_number)
def test_ngram_text_generator_generate_next_word_incorrect_context(self):
    """
    _generate_next_word must raise ValueError for every malformed context.
    """
    tokens = (
        'i', 'have', 'a', 'cat', '<END>',
        'his', 'name', 'is', 'bruno', '<END>',
    )
    storage = WordStorage()
    storage.update(tokens)
    trigram_trie = NGramTrie(3, encode_text(storage, tokens))
    text_generator = NGramTextGenerator(storage, trigram_trie)

    # (3, ) has the right type but the wrong size for a trigram model.
    invalid_contexts = [[], {}, (3, ), None, 9, 9.34, True]

    for invalid_context in invalid_contexts:
        with self.assertRaises(ValueError):
            text_generator._generate_next_word(invalid_context)
def test_load_model_has_generator_methods(self):
    """
    A model restored with load_model must expose the same attribute and
    method names (as reported by dir()) as the generator that was saved.
    """
    corpus = (
        'i', 'have', 'a', 'cat', '<END>',
        'his', 'name', 'is', 'bruno', '<END>',
        'i', 'have', 'a', 'dog', 'too', '<END>',
        'his', 'name', 'is', 'rex', '<END>',
        'her', 'name', 'is', 'rex', 'too', '<END>',
    )
    storage = WordStorage()
    storage.update(corpus)
    encoded = encode_text(storage, corpus)
    trie = NGramTrie(2, encoded)
    generator = NGramTextGenerator(storage, trie)

    save_model(generator, 'my_awesome_model')
    loaded_model = load_model('my_awesome_model')

    # assertEquals is a deprecated alias that was removed in Python 3.12;
    # assertEqual is the supported spelling.
    self.assertEqual(dir(loaded_model), dir(generator))
def test_save_model_ideal(self):
    """
    After save_model the target file must exist and be non-empty.
    """
    tokens = (
        'i', 'have', 'a', 'cat', '<END>',
        'his', 'name', 'is', 'bruno', '<END>',
        'i', 'have', 'a', 'dog', 'too', '<END>',
        'his', 'name', 'is', 'rex', '<END>',
        'her', 'name', 'is', 'rex', 'too', '<END>',
    )
    word_storage = WordStorage()
    word_storage.update(tokens)
    bigram_trie = NGramTrie(2, encode_text(word_storage, tokens))
    text_generator = NGramTextGenerator(word_storage, bigram_trie)

    save_model(text_generator, 'my_awesome_model')

    with open('my_awesome_model', 'r', encoding='utf-8') as saved_file:
        saved_content = saved_file.read()

    self.assertTrue(saved_content)
def test_save_model_incorrect(self):
    """
    save_model must raise ValueError for every malformed path argument.
    """
    tokens = (
        'i', 'have', 'a', 'cat', '<END>',
        'his', 'name', 'is', 'bruno', '<END>',
        'i', 'have', 'a', 'dog', 'too', '<END>',
        'his', 'name', 'is', 'rex', '<END>',
        'her', 'name', 'is', 'rex', 'too', '<END>',
    )
    word_storage = WordStorage()
    word_storage.update(tokens)
    bigram_trie = NGramTrie(2, encode_text(word_storage, tokens))
    text_generator = NGramTextGenerator(word_storage, bigram_trie)

    invalid_paths = ((), [], {}, 123, None, WordStorage)
    for invalid_path in invalid_paths:
        with self.assertRaises(ValueError):
            save_model(text_generator, invalid_path)
""" Text generator implementation starter """ from lab_4.main import WordStorage, NGramTrie, NGramTextGenerator from lab_4.main import tokenize_by_sentence, encode_text if __name__ == '__main__': # here goes your function calls first_text = open('lab_3/Frank_Baum.txt', encoding="utf-8") first_text_tokenized = tokenize_by_sentence(first_text.read()) word_storage = WordStorage() word_storage.update(first_text_tokenized) encoded = encode_text(word_storage, first_text_tokenized) n_gram_trie = NGramTrie(2, encoded) generator = NGramTextGenerator(word_storage, n_gram_trie) RESULT = generator.generate_text(encoded[16:17], 3) #print(RESULT) assert RESULT, "Not working"
""" Lab 4 implementation starter """ from lab_4.main import WordStorage, NGramTrie, NGramTextGenerator from lab_4.main import tokenize_by_sentence, encode_text if __name__ == '__main__': text = 'She has a house. He has a house too. Besides he has a car. My friend also he has a car. ' \ 'Seems like everyone has has a car, but me.' text_tokenized = tokenize_by_sentence(text) word_storage = WordStorage() word_storage.update(text_tokenized) encoded_text = encode_text(word_storage, text_tokenized) print(encoded_text) trie = NGramTrie(2, encoded_text) context = (word_storage.get_id('has'),) generator = NGramTextGenerator(word_storage, trie) actual = generator.generate_text(context, 4) RESULT = actual print(RESULT) # DO NOT REMOVE NEXT LINE - KEEP IT INTENTIONALLY LAST assert RESULT, "Something went wrong"
from lab_4.main import encode_text
from lab_4.main import NGramTextGenerator

# NOTE(review): tokenize_by_sentence, WordStorage and NGramTrie are used
# below but are not imported in the visible part of this script - confirm
# that they are brought into scope elsewhere.

if __name__ == '__main__':
    text = (
        'I have a cat. His name is Bruno. I have a dog too. '
        'His name is Rex. Her name is Rex too'
    )

    corpus = tokenize_by_sentence(text)

    word_storage = WordStorage()
    word_storage.update(corpus)

    encoded_text = encode_text(word_storage, corpus)

    n_gram_trie = NGramTrie(2, encoded_text)
    n_gram_text_generator = NGramTextGenerator(word_storage, n_gram_trie)

    context = (word_storage.get_id('i'), word_storage.get_id('have'))
    text_generated = n_gram_text_generator.generate_text(context, 2)

    # Decode every generated id and drop the sentence terminators.
    decoded_words = (word_storage.get_word(word_id) for word_id in text_generated)
    RESULT = ' '.join(word for word in decoded_words if word != '<END>')

    print(RESULT)
    assert RESULT == 'i have a cat name is rex', 'Something went wrong :('
""" Text generator implementation starter """ from lab_4.main import WordStorage, NGramTrie, NGramTextGenerator from lab_4.main import tokenize_by_sentence, encode_text if __name__ == '__main__': text = 'I have a cat. His name is Bruno. I have a dog. Her name is Rex. Her name is Rex too.' text_tokenized = tokenize_by_sentence(text) word_storage = WordStorage() word_storage.update(text_tokenized) encoded = encode_text(word_storage, text_tokenized) trie = NGramTrie(2, encoded) context = (word_storage.get_id('i'), ) generator = NGramTextGenerator(word_storage, trie) RESULT = generator.generate_text(context, 4) print(RESULT) assert RESULT, "Not working"
On Monday, I go to work. I work at the post office. Everyone shops for food at the grocery store. They also eat at the restaurant. The restaurant serves pizza and ice cream. My friends and I go to the park. We like to play soccer at the park. On Fridays, we go to the cinema to see a movie. Children don't go to school on the weekend. Each day, people go to the hospital when they are sick. The doctors and nurses take care of them in the city. The police keep everyone safe. I am happy to live in my city. """ corpus = tokenize_by_sentence(TEXT) word_storage = WordStorage() word_storage.update(corpus) encoded_corpus = encode_text(word_storage, corpus) ngrams = NGramTrie(3, encoded_corpus) generator = NGramTextGenerator(word_storage, ngrams) context = (word_storage.get_id('the'), word_storage.get_id('post')) RESULT = generator.generate_text(context, 1) print(' '.join([word_storage.get_word(word) for word in RESULT])) assert RESULT, 'Language generator work incorrect'
if __name__ == '__main__':
    # The raw corpus is assembled from adjacent literals; the resulting
    # string is one line with single spaces between the sentences.
    raw_text = (
        "Hi everyone! Nice to meet you again. "
        "What are you doing in my laboratory work? "
        "You are very nice person, do you know it? "
        "To be honest, I can't stand programming. "
        "But it doesn't depend on you! "
        "It's my personal problem and I don't know how to solve it... "
        "It doesn't matter right now"
    )
    text = tokenize_by_sentence(raw_text)

    word_storage = WordStorage()
    word_storage.update(text)

    encoded_text = encode_text(word_storage, text)

    n_gram_trie = NGramTrie(3, encoded_text)
    generator_of_text = NGramTextGenerator(word_storage, n_gram_trie)

    context = (word_storage.get_id('on'), word_storage.get_id('you'))
    formed_ids = generator_of_text.generate_text(context, 1)

    # Decode every generated id and leave out the sentence terminators.
    decoded_words = (word_storage.get_word(word_id) for word_id in formed_ids)
    RESULT = ' '.join(word for word in decoded_words if word != '<END>')

    print(RESULT)
    assert RESULT == 'on you', ''
from lab_4.main import tokenize_by_sentence, WordStorage, encode_text, NGramTextGenerator
from lab_4.ngrams.ngram_trie import NGramTrie

if __name__ == '__main__':
    TEXT = (
        'I like flowers.\n'
        'My mom likes flowers too.\n'
        'Her favourite flower is rose.\n'
        'My favourite flower is rose too.'
    )

    corpus = tokenize_by_sentence(TEXT)

    word_storage = WordStorage()
    word_storage.update(corpus)

    encoded_corpus = encode_text(word_storage, corpus)
    ngrams = NGramTrie(2, encoded_corpus)
    text_generator = NGramTextGenerator(word_storage, ngrams)

    # The context is the hard-coded word id 1; two sentences are requested.
    gen_text = text_generator.generate_text((1, ), 2)

    # Count the sentence terminators in the generated sequence.
    end = word_storage.get_id('<END>')
    actual = gen_text.count(end)

    RESULT = 2
    print(actual)
    assert RESULT == actual, 'not working'
from lab_4.main import decode_text from lab_4.main import NGramTextGenerator from lab_4.main import LikelihoodBasedTextGenerator from lab_4.main import BackOffGenerator if __name__ == '__main__': corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is', 'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>', 'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is', 'rex', 'too', '<END>') storage = WordStorage() storage.update(corpus) encoded = encode_text(storage, corpus) trie = ngrams.NGramTrie(2, encoded) context = (storage.get_id('i'), ) generator = NGramTextGenerator(storage, trie) actual = generator.generate_text(context, 5) actual = decode_text(storage, actual) print(actual) generator = LikelihoodBasedTextGenerator(storage, trie) actual = generator.generate_text(context, 5) actual = decode_text(storage, actual) print(actual) two = ngrams.NGramTrie(2, encoded) trie = ngrams.NGramTrie(3, encoded) context = ( storage.get_id('name'), storage.get_id('is'),
""" Lab 4 implementation starter """ from lab_4.main import tokenize_by_sentence, WordStorage, encode_text, NGramTextGenerator from lab_4.ngrams.ngram_trie import NGramTrie if __name__ == '__main__': text = 'I have a dog.\nHis name is Will' tokenize_text = tokenize_by_sentence(text) print(tokenize_text) storage = WordStorage() storage.update(tokenize_text) print(storage) encode = encode_text(storage, tokenize_text) print(encode) n_gram_trie = NGramTrie(2, encode) print(n_gram_trie) generator = NGramTextGenerator(storage, n_gram_trie) context = (storage.get_id('a'), ) print(context) RESULT = generator.generate_text(context, 3) print(RESULT) # DO NOT REMOVE NEXT LINE - KEEP IT INTENTIONALLY LAST assert RESULT, 'Not working'
from lab_4.main import tokenize_by_sentence
from lab_4.main import WordStorage, LikelihoodBasedTextGenerator
from lab_4.main import encode_text, decode_text
from lab_4.main import NGramTextGenerator

# NOTE(review): NGramTrie is used below but is not imported in the visible
# part of this script - confirm that it is brought into scope elsewhere.

if __name__ == '__main__':
    TEXT = (
        'I have a cat. His name is Bruno. I have a dog too. '
        'His name is Rex. Her name is Rex too.'
    )
    tokenized_text = tokenize_by_sentence(TEXT)

    word_storage = WordStorage()
    word_storage.update(tokenized_text)

    encoded = encode_text(word_storage, tokenized_text)
    trie = NGramTrie(3, encoded)

    context = (word_storage.get_id('name'), word_storage.get_id('is'))

    # Plain n-gram generation; its output is not checked below.
    generator = NGramTextGenerator(word_storage, trie)
    generated_text = generator.generate_text(context, 2)

    # Likelihood-based generation; this output is decoded and asserted.
    gen_likelihood = LikelihoodBasedTextGenerator(word_storage, trie)
    gen_text = gen_likelihood.generate_text(context, 2)

    RESULT = decode_text(word_storage, gen_text)

    assert RESULT == ('Name is rex', 'Her name is rex'), "Not working"
from lab_4.ngrams.ngram_trie import NGramTrie
from lab_4.main import tokenize_by_sentence
from lab_4.main import WordStorage
from lab_4.main import encode_text
from lab_4.main import NGramTextGenerator

if __name__ == '__main__':
    text = (
        "This is a dog. It likes running. This is a cat. "
        "It likes sleeping. Everyone likes sleeping too."
    )
    text_in_tokens = tokenize_by_sentence(text)

    word_storage = WordStorage()
    word_storage.update(text_in_tokens)

    encoded_text = encode_text(word_storage, text_in_tokens)

    n_gram_trie = NGramTrie(2, encoded_text)
    context = (word_storage.get_id('likes'),)
    text_generator = NGramTextGenerator(word_storage, n_gram_trie)

    # Four sentences are generated starting from the word 'likes'.
    RESULT = text_generator.generate_text(context, 4)

    print(RESULT)
    assert RESULT, "Someting went worng.."