def test_detokenize(self):
    """Check that ``corpus.detokenize`` rebuilds a sentence from its tokens.

    ``assertEqual`` is used because the ``assertEquals`` alias is no longer
    available in ``unittest`` from Python 3.12 onwards.
    """
    tokens = ['oh', 'you', "can't", 'help', 'that', 'said', 'the', 'cat']
    ref_text = "oh you can't help that said the cat"
    ref_text2 = "oh you can't help that said the cat "  # same, plus a trailing space
    text = corpus.detokenize(tokens)
    self.assertEqual(ref_text, text, "Should be same text")
    # NOTE(review): ref_text and ref_text2 differ only by a trailing space, so
    # this check and the one above cannot both pass -- confirm which output
    # detokenize is meant to produce and drop the other assertion.
    self.assertEqual(ref_text2, text, "Should be same text")
def generate():
    """Train a model on a user-chosen file and return one generated text.

    Prompts on stdin for the training file name and the n-gram order, builds
    a model with ``init(n)``, trains it on the tokenized lines of the file and
    detokenizes a single generated token list.

    Returns:
        The value of ``corpus.detokenize`` applied to ``model.generate()``.

    Raises:
        OSError: if the training file cannot be opened.
        ValueError: if the entered n-gram order is not an integer.
    """
    file_name = input("Please enter a training set's filename:")
    # The context manager guarantees the training file is closed even if a
    # later prompt or the tokenizer raises.
    with open(file_name) as file:
        n = int(input("Please input a n-grams value 'n':"))
        model = init(n)
        sequences = [corpus.tokenize(line) for line in file]
    model.train(sequences)
    return corpus.detokenize(model.generate())
def generate_save():
    """Train a model on a user-chosen file and save generated texts to a file.

    Prompts on stdin, in order, for the output file name, the number of texts
    to generate, the training file name and the n-gram order. Each generated
    text is detokenized and written on its own line of the output file.

    Raises:
        OSError: if the training file cannot be opened or the output file
            cannot be written.
        ValueError: if the number of texts or the n-gram order is not an
            integer.
    """
    filename = input('Please input a filename:')
    number = int(input('Please input number of desire text:'))
    file_name = input("Please enter a training set's filename:")
    # Both files are handled by context managers so neither handle leaks
    # when a prompt, the tokenizer or the model raises.
    with open(file_name) as training_file:
        n = int(input("Please input a n-grams value 'n':"))
        model = init(n)
        sequences = [corpus.tokenize(line) for line in training_file]
    model.train(sequences)
    new_text = ''.join(
        corpus.detokenize(model.generate()) + '\n' for _ in range(number)
    )
    # The output file is only opened once every text has been generated, so a
    # failure during generation does not truncate an existing file.
    with open(filename, 'w') as out_file:
        out_file.write(new_text)
def test_short(self):
    """A lone period token must detokenize to the string '.'."""
    single_token = ['.']
    actual = corpus.detokenize(single_token)
    self.assertEqual(actual, '.')
def test_generate(self):
    """Text generated by a trained trigram model detokenizes to a ``str``."""
    model = lm.LanguageModel(3)
    model.train([['This', 'is', 'an', 'apple', '.']])
    generated = corpus.detokenize(model.generate())
    # assertIsInstance reports the offending object and its type on failure,
    # unlike assertTrue over a type comparison.
    self.assertIsInstance(generated, str)
def main():
    """Run the interactive text menu for the n-gram language model.

    Loops forever, printing the menu and reading a choice from stdin:

    * 1 -- create a new ``lm.LanguageModel`` with a user-supplied ``n``
    * 2 -- call ``load`` on the model with a user-supplied file name
    * 3 -- print one generated, detokenized text
    * 4 -- write a user-supplied number of generated texts to a file
    * 5 -- print ``p_next`` for a lower-cased, tokenized user-supplied text
    * 6 -- print the rounded perplexity of the model
    * 7 -- exit via ``sys.exit(0)``

    Options 2-6 need a model; if none has been created yet the user is told
    to press 1 first instead of the program failing on an unbound variable.

    Raises:
        ValueError: if a numeric prompt receives a non-integer.
        SystemExit: when option 7 is chosen.
    """
    needs_model = {"2", "3", "4", "5", "6"}
    c = None  # the current model; stays None until option 1 is chosen
    while True:
        print("Press 1 : Create a new language model with a user-specified n")
        print(
            "Press 2 : Load texts from a file, and train the language model on those texts"
        )
        print(
            "Press 3 : Generate a text from the language model, and print it to the screen"
        )
        print(
            "Press 4 : Generate a user-specified number of texts from the language model, and write them to a file"
        )
        print(
            "Press 5 : Print the predicted next word's probability distribution"
        )
        print("Press 6 : Perplexity of language model")
        print("Press 7 : Exit")
        print("Enter your choice (integer) ")
        text = input()

        if text in needs_model and c is None:
            print()
            print("No language model exists yet. Press 1 to create one first")
            continue

        if text == "1":
            print()
            print("Enter the value of n(integer value)")
            n = int(input())
            c = lm.LanguageModel(n)
            print("The value for ngram language model is ", n, "gram model")
        elif text == "2":
            print()
            print("You have pressed 2")
            print("Enter the filename")
            filename = input()
            c.load(filename)
        elif text == "3":
            print()
            print("You have pressed 3 ")
            print("Generate a random text")
            print(corpus.detokenize(c.generate()))
        elif text == "4":
            print()
            print("You have pressed 4 ")
            print("Enter the number for how many random texts you want")
            number_random = int(input())
            print("Enter the filename you want to save for random text")
            filename = input()
            with open(filename, "w") as file:
                # range() yields nothing for a negative count, so a bad
                # number cannot make this loop run forever.
                for _ in range(number_random):
                    file.write(corpus.detokenize(c.generate()) + "\n")
        elif text == "5":
            print()
            print("You have pressed 5 ")
            print(
                "Enter the text and predict the next word's probability distribution"
            )
            s = input().lower()
            print(c.p_next(corpus.tokenize(s)))
        elif text == "6":
            print()
            print("You have pressed 6 ")
            print("Perplexity of the current language model is ",
                  round(c.perplexity()))
        elif text == "7":
            print()
            print("You have pressed 7 for exit")
            # Only report the size of the model's pdf when a model exists.
            if c is not None:
                print(len(c.pdf))
            print("Exiting the main program")
            sys.exit(0)
        else:
            print(
                "Incorrect input. Please enter correct input for selecting option"
            )
def test_empty(self):
    """An empty token list must detokenize to the empty string."""
    no_tokens = []
    actual = corpus.detokenize(no_tokens)
    self.assertEqual(actual, '')
def generate5(mdl):
    """Write five generated texts to 'new_shakespeare.txt'.

    Each text comes from ``cp.detokenize(mdl.generate())`` and is followed by
    two newlines. The output file is overwritten.

    Args:
        mdl: object providing a ``generate()`` method.
    """
    separator = '\n\n'
    with open('new_shakespeare.txt', 'w') as out:
        for _ in range(5):
            text = cp.detokenize(mdl.generate())
            out.write(text)
            out.write(separator)
def generate(mdl):
    """Print one generated text, preceded by a blank line.

    Args:
        mdl: object providing a ``generate()`` method whose result is passed
            to ``cp.detokenize``.
    """
    tokens = mdl.generate()
    text = cp.detokenize(tokens)
    print('\n' + text)
def test_detokenize(self):
    """Detokenizing ``self.input_tokens`` must yield the fixture sentence."""
    # Print the test id before the check runs.
    test_id = self.id()
    print("id: " + test_id)
    result_text = "Simple array for testing detokenization"
    actual = corpus.detokenize(self.input_tokens)
    self.assertEqual(actual, result_text)
def test_detokenize_produces_expected_tokens(tokens, detokenized):
    """The detokenized text, wrapped in a list, must equal ``detokenized``."""
    actual = [detokenize(tokens)]
    assert actual == detokenized
def test_detokenize_handles_arbitrary_texts(tokens):
    """``detokenize`` must accept ``tokens`` without raising."""
    wrapped = [detokenize(tokens)]
    # A one-element list is always truthy, so this assertion cannot fail on
    # its own; the test only fails if detokenize raises.
    assert wrapped