def test_count_3gram(self):
    """Check 2-gram and 3-gram counts of a trigram model on a local corpus.

    The corpus includes a one-word sentence ('unaria'); every listed gram
    is expected to have been counted exactly once.
    """
    corpus = [
        'el gato come pescado .'.split(),
        'la gata come salmón .'.split(),
        'unaria'.split(),
    ]
    model = NGram(3, corpus)

    # contexts (length 2)
    bigrams = [
        ('<s>', 'el'),
        ('el', 'gato'),
        ('gato', 'come'),
        ('come', 'pescado'),
        ('pescado', '.'),
        ('<s>', 'la'),
        ('la', 'gata'),
        ('gata', 'come'),
        ('come', 'salmón'),
        ('salmón', '.'),
        ('<s>', 'unaria'),
    ]
    # full n-grams (length 3)
    trigrams = [
        ('<s>', 'el', 'gato'),
        ('el', 'gato', 'come'),
        ('gato', 'come', 'pescado'),
        ('come', 'pescado', '.'),
        ('<s>', 'la', 'gata'),
        ('la', 'gata', 'come'),
        ('gata', 'come', 'salmón'),
        ('come', 'salmón', '.'),
        ('salmón', '.', '</s>'),
        ('<s>', 'unaria', '</s>'),
    ]
    for gram in bigrams + trigrams:
        self.assertEqual(model.count(gram), 1)
def test_count_3gram(self):
    """Check 2-gram and 3-gram counts of a trigram model built on self.sents."""
    model = NGram(3, self.sents)

    grams = [
        # contexts (length 2)
        ('<s>', 'el'),
        ('el', 'gato'),
        ('gato', 'come'),
        ('come', 'pescado'),
        ('pescado', '.'),
        ('<s>', '<s>'),
        ('<s>', 'la'),
        ('la', 'gata'),
        ('gata', 'come'),
        ('come', 'salmón'),
        ('salmón', '.'),
        # full n-grams (length 3)
        ('<s>', '<s>', 'el'),
        ('<s>', 'el', 'gato'),
        ('el', 'gato', 'come'),
        ('gato', 'come', 'pescado'),
        ('come', 'pescado', '.'),
        ('pescado', '.', '</s>'),
        ('<s>', '<s>', 'la'),
        ('<s>', 'la', 'gata'),
        ('la', 'gata', 'come'),
        ('gata', 'come', 'salmón'),
        ('come', 'salmón', '.'),
        ('salmón', '.', '</s>'),
    ]
    # Everything is expected once, except the doubled start marker,
    # which is expected twice.
    expected = dict.fromkeys(grams, 1)
    expected[('<s>', '<s>')] = 2

    for gram, times in expected.items():
        self.assertEqual(model.count(gram), times)
def test_generate_sent_3and4gram(self):
    """Sentences sampled from 3- and 4-gram generators belong to a known set.

    Both generators are built on self.sents4 and sampled 1000 times; each
    generated sentence (joined with spaces) must be one of the expected
    sentences.
    """
    ngram = NGram(3, self.sents4)
    ngram2 = NGram(4, self.sents4)
    generator = NGramGenerator(ngram)
    generator2 = NGramGenerator(ngram2)

    # all the possible generated sentences for 3 or 4-grams
    # (a set: the original list repeated some entries)
    possible = {
        'la casa se construye y el corre y la gata come ensalada',
        'el corre y la gata come pescado y duerme',
        'la casa se construye y el corre y la gata come pescado y duerme',
        'la casa se construye y el corre',
        'la gata come pescado y duerme',
        'el corre y la gata come ensalada',
        'el corre',
        'la gata come ensalada',
    }

    for _ in range(1000):
        sent = generator.generate_sent()
        sent2 = generator2.generate_sent()
        # assertIn reports the offending sentence when the check fails
        self.assertIn(' '.join(sent), possible)
        self.assertIn(' '.join(sent2), possible)
def test_count_2gram(self):
    """Check 1-gram and 2-gram counts of a bigram model built on self.sents."""
    model = NGram(2, self.sents)

    expected = [
        # contexts (length 1)
        (('<s>',), 2),
        (('el',), 1),
        (('gato',), 1),
        (('come',), 2),
        (('pescado',), 1),
        (('.',), 2),
        (('la',), 1),
        (('gata',), 1),
        (('salmón',), 1),
        # full n-grams (length 2)
        (('<s>', 'el'), 1),
        (('el', 'gato'), 1),
        (('gato', 'come'), 1),
        (('come', 'pescado'), 1),
        (('pescado', '.'), 1),
        (('.', '</s>'), 2),
        (('<s>', 'la'), 1),
        (('la', 'gata'), 1),
        (('gata', 'come'), 1),
        (('come', 'salmón'), 1),
        (('salmón', '.'), 1),
    ]
    for gram, times in expected:
        self.assertEqual(model.count(gram), times)
def test_cond_prob_2gram(self):
    """Check bigram conditional probabilities of tokens following 'come'."""
    model = NGram(2, self.sents)

    # (token, previous token, expected probability)
    cases = [
        ('pescado', 'come', 0.5),
        ('salmón', 'come', 0.5),
        ('salame', 'come', 0.0),
    ]
    for token, prev, expected in cases:
        context = (prev,)
        self.assertAlmostEqual(model.cond_prob(token, context), expected)
def test_cond_prob_1gram(self):
    """Check unigram probabilities for seen and unseen tokens.

    The expected values are fractions over 12; 'salame' is expected to
    have probability 0.0.
    """
    ngram = NGram(1, self.sents)

    probs = {
        'pescado': 1 / 12.0,
        'come': 2 / 12.0,
        'salame': 0.0,
    }
    for token, p in probs.items():
        # 1/12 is not exactly representable as a float, so compare with a
        # tolerance instead of exact equality.
        self.assertAlmostEqual(ngram.cond_prob(token), p)
def test_cond_prob_1gram(self):
    """Check unigram probabilities for seen and unseen tokens."""
    model = NGram(1, self.sents)

    # (token, expected probability)
    cases = [
        ('pescado', 1 / 12.0),
        ('come', 2 / 12.0),
        ('salame', 0.0),
    ]
    for token, expected in cases:
        self.assertAlmostEqual(model.cond_prob(token), expected)
def test_cond_prob_2gram(self):
    """Check bigram conditional probabilities of tokens following 'come'.

    The previous token is passed to cond_prob as a one-element list.
    """
    ngram = NGram(2, self.sents)

    probs = {
        ('pescado', 'come'): 0.5,
        ('salmón', 'come'): 0.5,
        ('salame', 'come'): 0.0,
    }
    for (token, prev), p in probs.items():
        # Probabilities are floats: compare with a tolerance rather than
        # exact equality.
        self.assertAlmostEqual(ngram.cond_prob(token, [prev]), p)
def test_sent_prob_1gram(self):
    """Check whole-sentence probabilities under a unigram model."""
    model = NGram(1, self.sents)

    # 'come', '.' and '</s>' have prob 1/6, the rest have 1/12.
    frequent = 1 / 6.0
    rare = 1 / 12.0

    cases = [
        ('el gato come pescado .', frequent**3 * rare**3),
        ('la gata come salmón .', frequent**3 * rare**3),
        ('el gato come salame .', 0.0),  # 'salame' unseen
        ('la la la', frequent * rare**3),
    ]
    for text, expected in cases:
        self.assertAlmostEqual(model.sent_prob(text.split()), expected, msg=text)
def test_sent_log_prob_1gram(self):
    """Check base-2 sentence log-probabilities under a unigram model."""
    model = NGram(1, self.sents)

    # 'come', '.' and '</s>' have prob 1/6, the rest have 1/12.
    frequent = log(1 / 6.0, 2)
    rare = log(1 / 12.0, 2)

    cases = [
        ('el gato come pescado .', 3 * frequent + 3 * rare),
        ('la gata come salmón .', 3 * frequent + 3 * rare),
        ('el gato come salame .', -inf),  # 'salame' unseen
        ('la la la', frequent + 3 * rare),
    ]
    for text, expected in cases:
        self.assertAlmostEqual(model.sent_log_prob(text.split()), expected, msg=text)
def test_cond_prob_3gram(self):
    """Check trigram conditional probabilities given two-token contexts."""
    model = NGram(3, self.sents)

    # (token, two previous tokens, expected probability)
    cases = [
        ('pescado', ('gato', 'come'), 1),
        ('salmón', ('gata', 'come'), 1),
        ('salame', ('gato', 'come'), 0.0),
        ('gato', ('<s>', 'el'), 1),
        ('gata', ('<s>', 'el'), 0.0),
    ]
    for token, context, expected in cases:
        # cond_prob receives the context as a list
        self.assertEqual(model.cond_prob(token, list(context)), expected)
def test_sent_prob_3gram(self):
    """Check whole-sentence probabilities under a trigram model."""
    model = NGram(3, self.sents)

    # after '<s>, <s>': 'el' and 'la' have prob 0.5.
    cases = [
        ('el gato come pescado .', 0.5 * 1 * 1 * 1 * 1),
        ('la gata come salmón .', 0.5 * 1 * 1 * 1 * 1),
        # prob('gato come salmon') = 0
        ('el gato come salmón .', 0.5 * 1 * 1 * 0 * 1 * 1),
        ('el gato come salame .', 0.0),  # 'salame' unseen
        ('la la la', 0.0),  # 'la' after 'la' unseen
    ]
    for text, expected in cases:
        self.assertAlmostEqual(model.sent_prob(text.split()), expected, msg=text)
def test_cond_prob_3gram(self):
    """Check trigram conditional probabilities, including start/end markers."""
    model = NGram(3, self.sents)

    # (token, first previous token, second previous token, expected probability)
    cases = [
        ('pescado', 'gato', 'come', 1),
        ('salmón', 'gata', 'come', 1),
        ('salame', 'gato', 'come', 0.0),
        ('salmón', 'gato', 'come', 0.0),
        ('el', '<s>', '<s>', 0.5),
        ('</s>', 'pescado', '.', 1),
    ]
    for token, first, second, expected in cases:
        context = [first, second]
        self.assertEqual(model.cond_prob(token, context), expected)
def test_sent_log_prob_1gram(self):
    """Check base-2 sentence log-probabilities under a unigram model."""
    model = NGram(1, self.sents)

    def lg(x):
        """Base-2 logarithm."""
        return log(x, 2)

    unseen = float('-inf')
    # 'come', '.' and '</s>' have prob 1/6, the rest have 1/12.
    cases = [
        ('el gato come pescado .', 3 * lg(1 / 6.0) + 3 * lg(1 / 12.0)),
        ('la gata come salmón .', 3 * lg(1 / 6.0) + 3 * lg(1 / 12.0)),
        ('el gato come salame .', unseen),  # 'salame' unseen
        ('la la la', lg(1 / 6.0) + 3 * lg(1 / 12.0)),
    ]
    for text, expected in cases:
        self.assertAlmostEqual(model.sent_log_prob(text.split()), expected, msg=text)
def test_cond_prob_4gram(self):
    """Check 4-gram conditional probabilities on self.sents2."""
    model = NGram(4, self.sents2)

    # (token, three previous tokens, expected probability)
    cases = [
        ('pescado', ('el', 'gato', 'come'), 1),
        ('salmón', ('la', 'gata', 'come'), 0.0),
        ('salame', ('el', 'gato', 'come'), 0.0),
        ('gato', ('<s>', '<s>', 'el'), 1),
        ('viejo', ('gato', 'come', 'pescado'), 0.5),
        ('fresco', ('gato', 'come', 'pescado'), 0.5),
    ]
    for token, context, expected in cases:
        # cond_prob receives the context as a list
        self.assertEqual(model.cond_prob(token, list(context)), expected)
def test_sent_prob_2gram(self):
    """Check whole-sentence probabilities under a bigram model."""
    model = NGram(2, self.sents)

    # after '<s>': 'el' and 'la' have prob 0.5.
    # after 'come': 'pescado' and 'salmón' have prob 0.5.
    cases = [
        ('el gato come pescado .', 0.5 * 0.5),
        ('la gata come salmón .', 0.5 * 0.5),
        ('el gato come salmón .', 0.5 * 0.5),
        ('la gata come pescado .', 0.5 * 0.5),
        ('el gato come salame .', 0.0),  # 'salame' unseen
        ('la la la', 0.0),  # 'la' after 'la' unseen
    ]
    for text, expected in cases:
        self.assertAlmostEqual(model.sent_prob(text.split()), expected, msg=text)
def test_sent_log_prob_2gram(self):
    """Check base-2 sentence log-probabilities under a bigram model."""
    model = NGram(2, self.sents)

    def lg(x):
        """Base-2 logarithm."""
        return log(x, 2)

    unseen = float('-inf')
    # after '<s>': 'el' and 'la' have prob 0.5.
    # after 'come': 'pescado' and 'salmón' have prob 0.5.
    cases = [
        ('el gato come pescado .', 2 * lg(0.5)),
        ('la gata come salmón .', 2 * lg(0.5)),
        ('el gato come salmón .', 2 * lg(0.5)),
        ('la gata come pescado .', 2 * lg(0.5)),
        ('el gato come salame .', unseen),  # 'salame' unseen
        ('la la la', unseen),  # 'la' after 'la' unseen
    ]
    for text, expected in cases:
        self.assertAlmostEqual(model.sent_log_prob(text.split()), expected, msg=text)
def test_init_3gram(self):
    """Check the probability tables a generator builds from a trigram model.

    Compares generator.probs (context -> {token: prob}) and
    generator.sorted_probs (context -> list of (token, prob) pairs)
    against the expected tables.
    """
    expected_probs = {
        ('<s>', '<s>'): {'el': 0.5, 'la': 0.5},
        ('<s>', 'el'): {'gato': 1.0},
        ('el', 'gato'): {'come': 1.0},
        ('gato', 'come'): {'pescado': 1.0},
        ('come', 'pescado'): {'.': 1.0},
        ('pescado', '.'): {'</s>': 1.0},
        ('<s>', 'la'): {'gata': 1.0},
        ('la', 'gata'): {'come': 1.0},
        ('gata', 'come'): {'salmón': 1.0},
        ('come', 'salmón'): {'.': 1.0},
        ('salmón', '.'): {'</s>': 1.0},
    }
    expected_sorted = {
        ('<s>', '<s>'): [('el', 0.5), ('la', 0.5)],
        ('<s>', 'el'): [('gato', 1.0)],
        ('el', 'gato'): [('come', 1.0)],
        ('gato', 'come'): [('pescado', 1.0)],
        ('come', 'pescado'): [('.', 1.0)],
        ('pescado', '.'): [('</s>', 1.0)],
        ('<s>', 'la'): [('gata', 1.0)],
        ('la', 'gata'): [('come', 1.0)],
        ('gata', 'come'): [('salmón', 1.0)],
        ('come', 'salmón'): [('.', 1.0)],
        ('salmón', '.'): [('</s>', 1.0)],
    }

    model = NGram(3, self.sents)
    gen = NGramGenerator(model)

    self.assertEqual(dict(gen.probs), expected_probs)
    self.assertEqual(gen.sorted_probs, expected_sorted)
def test_sent_log_prob_2gram(self):
    """Check base-2 sentence log-probabilities under a bigram model."""
    model = NGram(2, self.sents)

    # after '<s>': 'el' and 'la' have prob 0.5.
    # after 'come': 'pescado' and 'salmón' have prob 0.5.
    half = log(0.5, 2)

    cases = [
        ('el gato come pescado .', 2 * half),
        ('la gata come salmón .', 2 * half),
        ('el gato come salmón .', 2 * half),
        ('la gata come pescado .', 2 * half),
        ('el gato come salame .', -inf),  # 'salame' unseen
        ('la la la', -inf),  # 'la' after 'la' unseen
    ]
    for text, expected in cases:
        self.assertAlmostEqual(model.sent_log_prob(text.split()), expected, msg=text)
def test_count_1gram(self):
    """Check unigram counts, including the empty context and '</s>'."""
    model = NGram(1, self.sents)

    expected = [
        ((), 12),
        (('el',), 1),
        (('gato',), 1),
        (('come',), 2),
        (('pescado',), 1),
        (('.',), 2),
        (('</s>',), 2),
        (('la',), 1),
        (('gata',), 1),
        (('salmón',), 1),
    ]
    for gram, times in expected:
        self.assertEqual(model.count(gram), times)
def test_sent_log_prob_3gram(self):
    """Check base-2 sentence log-probabilities under a trigram model."""
    model = NGram(3, self.sents)

    def lg(x):
        """Base-2 logarithm."""
        return log(x, 2)

    unseen = float('-inf')
    # Going by the expected values, a sentence that is possible at all has
    # probability 0.5 (presumably the choice between 'el' and 'la' at the
    # start); everything else is impossible.
    cases = [
        ('el gato come pescado .', lg(0.5)),
        ('la gata come salmón .', lg(0.5)),
        ('el gato come salmón .', unseen),
        ('la gata come pescado .', unseen),
        ('el gato come salame .', unseen),  # 'salame' unseen
        ('la la la', unseen),  # 'la' after 'la' unseen
    ]
    for text, expected in cases:
        self.assertAlmostEqual(model.sent_log_prob(text.split()), expected, msg=text)
def test_generate_sent_1gram(self):
    """Sentences from a unigram generator only use the known vocabulary."""
    gen = NGramGenerator(NGram(1, self.sents))

    vocabulary = {'el', 'gato', 'come', 'pescado', '.', 'la', 'gata', 'salmón'}

    rounds = 100
    while rounds > 0:
        tokens = set(gen.generate_sent())
        # every generated token must belong to the vocabulary
        self.assertTrue(tokens <= vocabulary)
        rounds -= 1
def test_generate_token_3and4gram(self):
    """Tokens sampled from 3- and 4-gram generators respect their context.

    Both generators are built on self.sents3; deterministic contexts must
    always yield the same token, and sentence-start contexts must yield
    one of the two possible first words.
    """
    ngram = NGram(3, self.sents3)
    ngram2 = NGram(4, self.sents3)
    generator = NGramGenerator(ngram)
    generator2 = NGramGenerator(ngram2)

    for _ in range(100):
        # after 'come pescado' always comes 'y'
        token = generator.generate_token(('come', 'pescado'))
        self.assertEqual(token, 'y')

        # after 'come pescado y' always comes 'duerme'
        token = generator2.generate_token(('come', 'pescado', 'y'))
        self.assertEqual(token, 'duerme')

        # a sentence may start with 'el' or 'la'
        # (assertIn reports the offending token when the check fails)
        token = generator.generate_token(('<s>', '<s>'))
        self.assertIn(token, ['el', 'la'])
        token = generator2.generate_token(('<s>', '<s>', '<s>'))
        self.assertIn(token, ['el', 'la'])
def test_generate_token(self):
    """Tokens sampled from a bigram generator respect their one-token context."""
    ngram = NGram(2, self.sents)
    generator = NGramGenerator(ngram)

    for _ in range(100):
        # after 'el' always comes 'gato':
        token = generator.generate_token(('el',))
        self.assertEqual(token, 'gato')

        # after 'come' may come 'pescado' or 'salmón'
        # (assertIn reports the offending token when the check fails)
        token = generator.generate_token(('come',))
        self.assertIn(token, ['pescado', 'salmón'])
def test_perplexity_calculation(self):
    """Check the perplexity of 3- and 4-gram models on their training data.

    The expected value is 2 ** (-(2 * log2(0.5)) / 10.0); the 10.0 divisor
    is presumably the number of tokens being scored (confirm against
    NGram.perplexity).
    """
    model = NGram(3, self.sents)
    model2 = NGram(4, self.sents)

    perplexity = 2**(-(log2(0.5)*2)/10.0)

    # The value goes through log2 and exponentiation, so the model's result
    # may differ in the last bits: compare with a tolerance.
    self.assertAlmostEqual(perplexity, model.perplexity(self.sents))
    self.assertAlmostEqual(perplexity, model2.perplexity(self.sents))
def test_generate_sent_2gram(self):
    """Sentences sampled from a bigram generator belong to a known set."""
    ngram = NGram(2, self.sents)
    generator = NGramGenerator(ngram)

    # all the possible generated sentences for 2-grams:
    sents = [
        'el gato come pescado .',
        'la gata come salmón .',
        'el gato come salmón .',
        'la gata come pescado .',
    ]

    for _ in range(100):
        sent = generator.generate_sent()
        # the raw token list is kept as the failure message, as before
        self.assertIn(' '.join(sent), sents, sent)
def test_init_1gram(self):
    """Check the full counts table built by a unigram model."""
    model = NGram(1, self.sents)

    once = [('el',), ('gato',), ('pescado',), ('la',), ('gata',), ('salmón',)]
    twice = [('come',), ('.',)]

    expected = {(): 10}
    expected.update((gram, 1) for gram in once)
    expected.update((gram, 2) for gram in twice)

    self.assertEqual(dict(model.counts), expected)
def test_init_1gram(self):
    """Check the probability table a generator builds from a unigram model."""
    gen = NGramGenerator(NGram(1, self.sents))

    # token -> probability, all under the empty context
    distribution = {
        'el': 1 / 12.0,
        'gato': 1 / 12.0,
        'come': 2 / 12.0,
        'pescado': 1 / 12.0,
        '.': 2 / 12.0,
        '</s>': 2 / 12.0,
        'la': 1 / 12.0,
        'gata': 1 / 12.0,
        'salmón': 1 / 12.0,
    }
    expected = {(): distribution}

    self.assertEqual(gen._probs, expected)
def test_sent_log_prob_3and4gram(self):
    """Check sentence log-probabilities for 3- and 4-gram models on self.sents2.

    Both models are expected to give the same value for every sentence.
    """
    models = [NGram(3, self.sents2), NGram(4, self.sents2)]

    unseen = float('-inf')
    cases = [
        ('el gato come pescado nuevo .', unseen),  # 'nuevo' unseen
        ('la la la', unseen),  # 'la' after 'la' unseen
        # after 'pescado': 'viejo' and 'fresco' have prob 0.5.
        ('el gato come pescado fresco . ', log2(0.5)),
        ('el gato come pescado viejo . ', log2(0.5)),
    ]
    for text, expected in cases:
        for model in models:
            self.assertAlmostEqual(model.sent_log_prob(text.split()), expected, msg=text)
def test_init_2gram(self):
    """Check the full counts table built by a bigram model.

    NOTE(review): the original had the ('.',): 2 entry commented out; it
    stays excluded here. Confirm whether that was intentional.
    """
    model = NGram(2, self.sents)

    unigrams = {
        ('el',): 1,
        ('gato',): 1,
        ('come',): 2,
        ('pescado',): 1,
        ('la',): 1,
        ('gata',): 1,
        ('salmón',): 1,
    }
    bigrams = {
        ('el', 'gato'): 1,
        ('gato', 'come'): 1,
        ('come', 'pescado'): 1,
        ('pescado', '.'): 1,
        ('la', 'gata'): 1,
        ('gata', 'come'): 1,
        ('come', 'salmón'): 1,
        ('salmón', '.'): 1,
    }
    expected = dict(unigrams)
    expected.update(bigrams)

    self.assertEqual(dict(model.counts), expected)
def test_sent_prob_3and4gram(self):
    """Check sentence probabilities for 3- and 4-gram models on self.sents3.

    Both models are expected to give the same value for every sentence.
    """
    models = [NGram(3, self.sents3), NGram(4, self.sents3)]

    cases = [
        ('el gato come pescado y ronca .', 0.0),  # 'ronca' unseen
        ('la la la', 0.0),  # 'la' after 'la' unseen
        # the probability is decided at the start of the sentence:
        # whether it begins with 'la' or 'el'
        ('el gato come pescado y duerme . ', 0.5),
        ('la gata come pescado y duerme . ', 0.5),
    ]
    for text, expected in cases:
        for model in models:
            self.assertAlmostEqual(model.sent_prob(text.split()), expected, msg=text)
def test_prob_1gram(self):
    """Check unigram probabilities of 'pescado' and 'come'."""
    ngram = NGram(1, self.sents)

    # 0.1 and 0.2 are not exactly representable as floats, so compare with
    # a tolerance instead of exact equality.
    self.assertAlmostEqual(ngram.prob('pescado'), 0.1)
    self.assertAlmostEqual(ngram.prob('come'), 0.2)
def test_prob(self):
    """Check bigram probabilities of the two tokens seen after 'come'."""
    model = NGram(2, self.sents)

    for token in ('pescado', 'salmón'):
        # each token follows 'come' with the same expected probability
        self.assertEqual(model.prob(token, ['come']), 0.5)
# Disabled model selection (see the commented-out lookup in the script below):
# from languagemodeling.ngram import NGram, AddOneNGram, InterpolatedNGram
# models = {
#     'ngram': NGram,
#     'addone': AddOneNGram,
#     'inter': InterpolatedNGram,
# }

if __name__ == '__main__':
    # Script entry point: parse the options, load the corpus, train an
    # n-gram model and pickle it to the output file.
    opts = docopt(__doc__)

    # load the data
    # WORK HERE!! LOAD YOUR TRAINING CORPUS
    # sents = gutenberg.sents(['/Users/juliamilanese/Desktop/corpus_airbnb/txt'])
    # NLTK interprets the file-id argument as a regular expression, not a
    # shell glob: '*.txt' is not a valid pattern, '.*\.txt' is.
    # 'path' is a placeholder for the corpus root directory.
    corpus = PlaintextCorpusReader('path', r'.*\.txt')
    sents = list(corpus.sents())

    # train the model
    n = int(opts['-n'])
    model = NGram(n, sents)
    # model_class = models[opts['-m']]
    # model = model_class(n, sents)

    # save it
    filename = opts['-o']
    # the context manager closes the file even if pickling fails
    with open(filename, 'wb') as f:
        pickle.dump(model, f)