def test_generate_sent_1gram(self):
    ngram = NGram(1, self.sents)
    generator = NGramGenerator(ngram)

    voc = {'el', 'gato', 'come', 'pescado', '.', 'la', 'gata', 'salmón'}
    for i in range(100):
        sent = generator.generate_sent()
        self.assertTrue(set(sent).issubset(voc))
def test_generate_token(self):
    ngram = NGram(2, self.sents)
    generator = NGramGenerator(ngram)

    for i in range(100):
        # after 'el' always comes 'gato':
        token = generator.generate_token(('el',))
        self.assertEqual(token, 'gato')

        # after 'come' may come 'pescado' or 'salmón':
        token = generator.generate_token(('come',))
        self.assertIn(token, ['pescado', 'salmón'])
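# The bigram tests above fix the contract of generate_token: it must sample
# the next token from the conditional distribution of the given prefix. Below
# is a minimal sketch of one way to meet that contract, via inverse transform
# sampling over a sorted_probs table (the table asserted in the init tests
# further down). The names generate_token and sorted_probs come from the
# tests; the sampling internals are an assumption, not the reference code.
import random


def generate_token_sketch(sorted_probs, prev_tokens=()):
    """Sample the next token conditioned on prev_tokens (hypothetical helper)."""
    r = random.random()  # uniform draw in [0, 1)
    acc = 0.0
    token = None
    # walk the (token, probability) pairs, accumulating mass until it
    # exceeds the uniform draw
    for token, prob in sorted_probs[prev_tokens]:
        acc += prob
        if r < acc:
            break
    return token


# e.g. generate_token_sketch({('el',): [('gato', 1.0)]}, ('el',)) == 'gato'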
def test_generate_sent_2gram(self):
    ngram = NGram(2, self.sents)
    generator = NGramGenerator(ngram)

    # all the possible generated sentences for 2-grams:
    sents = [
        'el gato come pescado .',
        'la gata come salmón .',
        'el gato come salmón .',
        'la gata come pescado .',
    ]
    for i in range(100):
        sent = generator.generate_sent()
        self.assertIn(' '.join(sent), sents, sent)
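# generate_sent can then be a thin loop over the token sampler: start from a
# context of n-1 '<s>' markers, sample until '</s>' comes out, and return the
# sentence without sentinels. The '<s>'/'</s>' markers appear in the init
# tests below; the loop here is a sketch under the same assumptions as
# generate_token_sketch above.
def generate_sent_sketch(sorted_probs, n):
    """Sample a whole sentence from an n-gram model (hypothetical helper)."""
    prev = ('<s>',) * (n - 1)  # initial context of start markers
    sent = []
    token = generate_token_sketch(sorted_probs, prev)
    while token != '</s>':
        sent.append(token)
        # slide the context window one token to the right
        prev = (prev + (token,))[1:]
        token = generate_token_sketch(sorted_probs, prev)
    return sent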
def test_init_3gram(self):
    ngram = NGram(3, self.sents)
    generator = NGramGenerator(ngram)

    probs = {
        ('<s>', '<s>'): {'el': 0.5, 'la': 0.5},
        ('<s>', 'el'): {'gato': 1.0},
        ('el', 'gato'): {'come': 1.0},
        ('gato', 'come'): {'pescado': 1.0},
        ('come', 'pescado'): {'.': 1.0},
        ('pescado', '.'): {'</s>': 1.0},
        ('<s>', 'la'): {'gata': 1.0},
        ('la', 'gata'): {'come': 1.0},
        ('gata', 'come'): {'salmón': 1.0},
        ('come', 'salmón'): {'.': 1.0},
        ('salmón', '.'): {'</s>': 1.0},
    }
    sorted_probs = {
        ('<s>', '<s>'): [('el', 0.5), ('la', 0.5)],
        ('<s>', 'el'): [('gato', 1.0)],
        ('el', 'gato'): [('come', 1.0)],
        ('gato', 'come'): [('pescado', 1.0)],
        ('come', 'pescado'): [('.', 1.0)],
        ('pescado', '.'): [('</s>', 1.0)],
        ('<s>', 'la'): [('gata', 1.0)],
        ('la', 'gata'): [('come', 1.0)],
        ('gata', 'come'): [('salmón', 1.0)],
        ('come', 'salmón'): [('.', 1.0)],
        ('salmón', '.'): [('</s>', 1.0)],
    }
    self.assertEqual(dict(generator.probs), probs)
    self.assertEqual(generator.sorted_probs, sorted_probs)
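# The probs and sorted_probs tables asserted above follow directly from the
# model's counts: P(token | prev) = count(prev + (token,)) / count(prev).
# A sketch of how NGramGenerator.__init__ might build them, assuming the
# model exposes `n` and a `counts` dict holding both n-gram and (n-1)-gram
# counts; treat that interface as hypothetical, not confirmed by the tests.
from collections import defaultdict


def build_probs_sketch(model):
    """Return (probs, sorted_probs) for an n-gram model (hypothetical helper)."""
    n = model.n
    probs = defaultdict(dict)
    for tokens, count in model.counts.items():
        if len(tokens) == n:
            prev, token = tokens[:-1], tokens[-1]
            # relative frequency of token after the (n-1)-gram prev
            probs[prev][token] = count / float(model.counts[prev])
    # sort each conditional distribution by descending probability, breaking
    # ties alphabetically, matching the order asserted in test_init_3gram
    sorted_probs = {
        prev: sorted(dist.items(), key=lambda x: (-x[1], x[0]))
        for prev, dist in probs.items()
    }
    return probs, sorted_probs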
def test_generate_sent_3and4gram(self):
    ngram = NGram(3, self.sents4)
    ngram2 = NGram(4, self.sents4)
    generator = NGramGenerator(ngram)
    generator2 = NGramGenerator(ngram2)

    # all the possible generated sentences for 3- and 4-grams:
    sents = [
        'la casa se construye y el corre y la gata come ensalada',
        'la casa se construye y el corre y la gata come pescado y duerme',
        'la casa se construye y el corre',
        'el corre y la gata come pescado y duerme',
        'el corre y la gata come ensalada',
        'el corre',
        'la gata come pescado y duerme',
        'la gata come ensalada',
    ]
    for i in range(1000):
        sent = generator.generate_sent()
        sent2 = generator2.generate_sent()
        self.assertIn(' '.join(sent), sents)
        self.assertIn(' '.join(sent2), sents)
def test_generate_token_3and4gram(self):
    ngram = NGram(3, self.sents3)
    ngram2 = NGram(4, self.sents3)
    generator = NGramGenerator(ngram)
    generator2 = NGramGenerator(ngram2)

    for i in range(100):
        # after 'come pescado' always comes 'y':
        token = generator.generate_token(('come', 'pescado'))
        self.assertEqual(token, 'y')

        # after 'come pescado y' always comes 'duerme':
        token = generator2.generate_token(('come', 'pescado', 'y'))
        self.assertEqual(token, 'duerme')

        # a sentence may start with 'el' or 'la':
        token = generator.generate_token(('<s>', '<s>'))
        self.assertIn(token, ['el', 'la'])
        token = generator2.generate_token(('<s>', '<s>', '<s>'))
        self.assertIn(token, ['el', 'la'])
def test_init_1gram(self):
    ngram = NGram(1, self.sents)
    generator = NGramGenerator(ngram)

    probs = {
        (): {
            'el': 1 / 12.0,
            'gato': 1 / 12.0,
            'come': 2 / 12.0,
            'pescado': 1 / 12.0,
            '.': 2 / 12.0,
            '</s>': 2 / 12.0,
            'la': 1 / 12.0,
            'gata': 1 / 12.0,
            'salmón': 1 / 12.0,
        }
    }
    self.assertEqual(dict(generator.probs), probs)
"""Generate natural language sentences using a language model.

Usage:
  generate.py -i <file> -n <n>
  generate.py -h | --help

Options:
  -i <file>     Language model file.
  -n <n>        Number of sentences to generate.
  -h --help     Show this screen.
"""
import sys
sys.path.append("../../")

import pickle

from docopt import docopt

from languagemodeling.ngram import NGramGenerator


if __name__ == '__main__':
    opts = docopt(__doc__)

    n = int(opts['-n'])
    i = str(opts['-i'])

    f = open(i, 'rb')
    model = pickle.load(f)

    generator = NGramGenerator(model)
    for _ in range(n):
        sent = generator.generate_sent()
        for token in sent:
            print(token, end=" ")
        print("\n")
n = int(opts['-n'])
filename = opts['-i']

# the output will be written to the test/output.txt file
file_output = open(os.path.join(DEFAULT_OUTPUT_DIR, 'output.txt'), 'w')

if filename:
    # open the trained model for reading; it is an n-gram object
    # with n in {1, 2, 3, 4}
    file_model = open(filename, 'rb')
    ngram = pickle.load(file_model)
    # close the file
    file_model.close()

    # an instance of NGramGenerator with ngram
    generator = NGramGenerator(ngram)
    print('model loaded')

    for _ in range(0, n):
        list_sentence = generator.generate_sent()
        # join the token list with spaces between words
        file_output.write(' '.join(list_sentence))
        # append an end-of-line
        file_output.write('\r\n')
else:
    for i in range(1, 5):
        # open each trained model for reading, with n in {1, 2, 3, 4}
        file_model = open(str(i) + '-gram.txt', 'rb')
        ngram = pickle.load(file_model)
        file_model.close()
        # an instance of NGramGenerator with ngram
"""Generate natural language sentences using a language model.

Usage:
  generate.py -i <file> -n <n>
  generate.py -h | --help

Options:
  -i <file>     Language model file.
  -n <n>        Number of sentences to generate.
  -h --help     Show this screen.
"""
import pickle

from docopt import docopt

from languagemodeling.ngram import NGramGenerator


if __name__ == '__main__':
    opts = docopt(__doc__)

    # read options
    path = str(opts['-i'])
    n = int(opts['-n'])

    # open the model file
    file = open(path, 'rb')
    # load the model
    model = pickle.load(file)

    # create the generator
    generator = NGramGenerator(model)

    # print sentences as they are generated
    for _ in range(n):
        print(' '.join(generator.generate_sent()) + "\n")
"""Generate natural language sentences using a language model.

Usage:
  generate.py -i <file> -n <n>
  generate.py -h | --help

Options:
  -i <file>     Language model file.
  -n <n>        Number of sentences to generate.
  -h --help     Show this screen.
"""
from docopt import docopt
import pickle
import os.path
import sys

# Add ../../ to PYTHONPATH
sys.path.append(
    os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        os.pardir, os.pardir))

from languagemodeling.ngram import NGramGenerator


if __name__ == '__main__':
    opts = docopt(__doc__)

    # load the model
    filename = opts['-i']
    with open(filename, 'rb') as f:
        model = pickle.load(f)
    sys.stderr.write('Loaded model\n')

    # generate
    n = int(opts['-n'])
    generator = NGramGenerator(model)
    sys.stderr.write('Initialized generator\n')
    for i in range(n):
        print('Sentence %s:' % i)
        print(' '.join(generator.generate_sent()))
"""Generate natural language sentences using a language model. Usage: generate.py -i <file> -n <n> generate.py -h | --help Options: -i <file> Language model file. -n <n> Number of sentences to generate. -h --help Show this screen. """ from docopt import docopt import pickle from languagemodeling.ngram import NGramGenerator if __name__ == '__main__': opts = docopt(__doc__) filename = opts['-i'] f = open(filename, 'rb') model = pickle.load(f) f.close() generator = NGramGenerator(model) for _ in range(int(opts['-n'])): sent = ' '.join(generator.generate_sent()) print(sent) print("-------------------------------------------------------------")