def test_count_2gram(self):
    # NOTE(review): a method with this exact name is defined again immediately
    # below with an identical body; Python keeps only the last definition in a
    # class, so this copy is shadowed and never runs — one of the two should
    # be removed.
    """Check unigram/bigram counts and vocabulary size of an add-one 2-gram model."""
    model = AddOneNGram(2, self.sents)
    # Expected raw counts over self.sents, including <s>/</s> sentence markers.
    counts = {
        ('<s>',): 2,
        ('el',): 1,
        ('gato',): 1,
        ('come',): 2,
        ('pescado',): 1,
        ('.',): 2,
        ('la',): 1,
        ('gata',): 1,
        ('salmón',): 1,
        ('<s>', 'el'): 1,
        ('el', 'gato'): 1,
        ('gato', 'come'): 1,
        ('come', 'pescado'): 1,
        ('pescado', '.'): 1,
        ('.', '</s>'): 2,
        ('<s>', 'la'): 1,
        ('la', 'gata'): 1,
        ('gata', 'come'): 1,
        ('come', 'salmón'): 1,
        ('salmón', '.'): 1,
    }
    for gram, c in counts.items():
        self.assertEqual(model.count(gram), c, gram)
    # size of the vocabulary
    self.assertEqual(model.V(), 9)
def test_count_2gram(self):
    """Add-one bigram model: verify raw n-gram counts and the vocabulary size."""
    model = AddOneNGram(2, self.sents)
    expected = {
        # unigrams (sentence-start marker included)
        ('<s>',): 2,
        ('el',): 1,
        ('gato',): 1,
        ('come',): 2,
        ('pescado',): 1,
        ('.',): 2,
        ('la',): 1,
        ('gata',): 1,
        ('salmón',): 1,
        # bigrams (sentence-end marker included)
        ('<s>', 'el'): 1,
        ('el', 'gato'): 1,
        ('gato', 'come'): 1,
        ('come', 'pescado'): 1,
        ('pescado', '.'): 1,
        ('.', '</s>'): 2,
        ('<s>', 'la'): 1,
        ('la', 'gata'): 1,
        ('gata', 'come'): 1,
        ('come', 'salmón'): 1,
        ('salmón', '.'): 1,
    }
    for ngram, expected_count in expected.items():
        self.assertEqual(model.count(ngram), expected_count, ngram)
    # nine distinct vocabulary items (</s> counts, <s> does not)
    self.assertEqual(model.V(), 9)
def test_norm_1gram(self):
    """Unigram probabilities over the whole vocabulary should sum to (about) 1."""
    model = AddOneNGram(1, self.sents)
    vocab = {'el', 'gato', 'come', 'pescado', '.', 'la',
             'gata', 'salmón', '</s>'}
    total = sum(model.cond_prob(word) for word in vocab)
    # accept sums strictly below 1.0 or equal to 1.0 within float tolerance
    self.assertTrue(total < 1.0 or abs(total - 1.0) < 1e-10)
def test_count_3_addone_ngram(self):
    """Vocabulary size of a trigram add-one model on two corpora.

    self.sents2 has a vocabulary of 8 and self.sents a vocabulary of 9,
    so the expected size is 8 plus the corpus index.
    """
    # enumerate replaces the original manual `i = 0 ... i += 1` counter
    for offset, sents_chosen in enumerate([self.sents2, self.sents]):
        model = AddOneNGram(3, sents_chosen)
        self.assertEqual(model.V(), 8 + offset)
def test_norm_2gram(self):
    """For every one-token context, bigram probabilities sum to (about) 1."""
    model = AddOneNGram(2, self.sents)
    vocab = {'el', 'gato', 'come', 'pescado', '.', 'la',
             'gata', 'salmón', '</s>'}
    # every vocabulary word plus the sentence-start marker can be a context
    for context in list(vocab) + ['<s>']:
        total = sum(model.cond_prob(word, [context]) for word in vocab)
        # accept sums strictly below 1.0 or equal to 1.0 within float tolerance
        self.assertTrue(total < 1.0 or abs(total - 1.0) < 1e-10)
def test_cond_prob_4_addone_gram(self):
    """Spot-check add-one smoothed 4-gram conditional probabilities.

    Expected values follow (count(prev + token) + 1) / (count(prev) + V),
    with V == 9 and each three-token context occurring once.
    """
    model = AddOneNGram(4, self.sents)
    probs = {
        ('pescado', ('el', 'gato', 'come')): (1.0 + 1.0) / (1.0 + 9.0),
        ('salmón', ('la', 'gata', 'come')): (1.0 + 1.0) / (1.0 + 9.0),
        # unseen continuation: only the add-one pseudo-count remains
        ('salame', ('el', 'gato', 'come')): 1.0 / (1.0 + 9.0),
    }
    for (token, prev), p in probs.items():
        # assertAlmostEqual instead of assertEqual: comparing computed floats
        # for exact equality is fragile (the sibling 2-gram test already uses
        # the approximate form).
        self.assertAlmostEqual(model.cond_prob(token, list(prev)), p)
def test_cond_prob_2gram(self):
    # NOTE(review): a second method with this exact name is defined later in
    # the class; Python keeps only the last definition, so this test is
    # shadowed and never runs. The later copy differs only in using
    # assertEqual and a list context — the two should be merged.
    """Spot-check add-one smoothed bigram conditional probabilities.

    Expected values follow (count(prev, token) + 1) / (count(prev) + V),
    with count('come') == 2 and V == 9.
    """
    model = AddOneNGram(2, self.sents)
    probs = {
        ('pescado', 'come'): (1.0 + 1.0) / (2.0 + 9.0),
        ('salmón', 'come'): (1.0 + 1.0) / (2.0 + 9.0),
        # unseen bigram: only the add-one pseudo-count remains
        ('salame', 'come'): 1.0 / (2.0 + 9.0),
    }
    for (token, prev), p in probs.items():
        self.assertAlmostEqual(model.cond_prob(token, (prev, )), p)
def test_cond_prob_1gram(self):
    # NOTE(review): a second method with this exact name is defined later in
    # the class; Python keeps only the last definition, so this test is
    # shadowed and never runs. This copy is the better one (uses self.total
    # and assertAlmostEqual) — the duplicate should be resolved in its favor.
    """Spot-check add-one smoothed unigram probabilities.

    Expected values follow (count(token) + 1) / (self.total + V), with V == 9.
    """
    model = AddOneNGram(1, self.sents)
    probs = {
        'pescado': (1.0 + 1.0) / (self.total + 9.0),
        'come': (2.0 + 1.0) / (self.total + 9.0),
        # unseen token: only the add-one pseudo-count remains
        'salame': 1.0 / (self.total + 9.0),
    }
    for token, p in probs.items():
        self.assertAlmostEqual(model.cond_prob(token), p)
def test_cond_prob_2gram(self):
    """Spot-check add-one smoothed bigram conditional probabilities.

    Expected values follow (count(prev, token) + 1) / (count(prev) + V),
    with count('come') == 2 and V == 9.
    """
    model = AddOneNGram(2, self.sents)
    probs = {
        ('pescado', 'come'): (1.0 + 1.0) / (2.0 + 9.0),
        ('salmón', 'come'): (1.0 + 1.0) / (2.0 + 9.0),
        # unseen bigram: only the add-one pseudo-count remains
        ('salame', 'come'): 1.0 / (2.0 + 9.0),
    }
    for (token, prev), p in probs.items():
        # assertAlmostEqual instead of assertEqual: exact float comparison is
        # fragile and inconsistent with the other probability tests here.
        self.assertAlmostEqual(model.cond_prob(token, [prev]), p)
def test_cond_prob_1gram(self):
    """Spot-check add-one smoothed unigram probabilities.

    Expected values follow (count(token) + 1) / (self.total + V), with
    V == 9. Uses the self.total fixture attribute (12 for self.sents)
    instead of the original hard-coded 12.0, matching the other unigram
    probability test in this class.
    """
    model = AddOneNGram(1, self.sents)
    probs = {
        'pescado': (1.0 + 1.0) / (self.total + 9.0),
        'come': (2.0 + 1.0) / (self.total + 9.0),
        # unseen token: only the add-one pseudo-count remains
        'salame': 1.0 / (self.total + 9.0),
    }
    for token, p in probs.items():
        # assertAlmostEqual instead of assertEqual: exact float equality is
        # fragile against rounding in the model's arithmetic.
        self.assertAlmostEqual(model.cond_prob(token), p)
def test_norm_3gram(self):
    """For every two-token context, trigram probabilities sum to (about) 1."""
    model = AddOneNGram(3, self.sents)
    vocab = {'el', 'gato', 'come', 'pescado', '.', 'la',
             'gata', 'salmón', '</s>'}
    # contexts: double sentence start, start + word, and every word pair
    contexts = [['<s>', '<s>']]
    contexts += [['<s>', word] for word in vocab]
    contexts += [[w1, w2] for w1 in vocab for w2 in vocab]
    for context in contexts:
        total = sum(model.cond_prob(word, context) for word in vocab)
        # accept sums strictly below 1.0 or equal to 1.0 within float tolerance
        self.assertTrue(total < 1.0 or abs(total - 1.0) < 1e-10)
def test_get_most_probable_token_2gram(self):
    """SentSorter picks the most probable continuation for each context.

    The candidate lists are arranged so that the token the bigram model
    prefers is always the last element of the list.
    """
    model = AddOneNGram(2, self.sents)
    ss = SentSorter(model)
    prev_tokens = [('<s>',), ('el',), ('la',), ('gato',), ('gata',)]
    possible_tokens = [
        ['gato', 'come', 'el'],
        ['gata', 'gato'],
        ['gato', 'salta', 'gata'],
        ['salta', 'gata', 'come'],
        ['la', 'el', 'salta'],
    ]
    for p_tokens, prev in zip(possible_tokens, prev_tokens):
        # Bug fix: the original passed the whole prev_tokens list instead of
        # the current context `prev`, and compared against an undefined name
        # `tok` (NameError). The intended expectation is the last candidate,
        # p_tokens[-1].
        self.assertEqual(ss.get_most_probable_token(p_tokens, prev),
                         p_tokens[-1])
def test_sort_probable_sents_one_trainig_sent(self):
    """Trained on a single sentence, sorting permutations yields that sentence."""
    training = self.sents[:1]
    permutations = [
        'el come gato', 'el gato come', 'gato come el',
        'gato el come', 'come el gato', 'come gato el'
    ]
    for order in range(2, 4):
        model = AddOneNGram(order, training)
        ss = SentSorter(model)
        # a fresh lazy iterator per model order, as in the original
        shuffled = map(str.split, permutations)
        for sent in ss.sort_probable_sents(shuffled):
            self.assertEqual(sent, training[0])
| \$?\d+(?:\.\d+)?%? # currency and percentages, e.g. $12.40, 82% | \.\.\. # ellipsis | [][.,;"'?():-_`] # these are separate tokens; includes ], [ ''' tokenizer = RegexpTokenizer(pattern) root = '.' corpus = PlaintextCorpusReader(root, 'books\.txt', word_tokenizer=tokenizer) sents = corpus.sents() # train the model n = int(opts['-n']) if opts['-m'] == 'addone': model = AddOneNGram(n, sents) elif opts['-m'] == 'inter': gamma = opts['-g'] if gamma is None: model = InterpolatedNGram(n, sents, None, False) else: model = InterpolatedNGram(n, sents, gamma, False) elif opts['-m'] == 'interaddone': gamma = opts['-g'] if gamma is None: model = InterpolatedNGram(n, sents, None, True) else: model = InterpolatedNGram(n, sents, gamma, True) else: model = NGram(n, sents)