def test_count_2gram(self):
    """Counts of a gamma-interpolated bigram model include all unigrams and bigrams."""
    model = InterpolatedNGram(2, self.sents, gamma=1.0)
    expected = {
        (): 12,
        ('el',): 1,
        ('gato',): 1,
        ('come',): 2,
        ('pescado',): 1,
        ('.',): 2,
        ('</s>',): 2,
        ('la',): 1,
        ('gata',): 1,
        ('salmón',): 1,
        ('<s>', 'el'): 1,
        ('el', 'gato'): 1,
        ('gato', 'come'): 1,
        ('come', 'pescado'): 1,
        ('pescado', '.'): 1,
        ('.', '</s>'): 2,
        ('<s>', 'la'): 1,
        ('la', 'gata'): 1,
        ('gata', 'come'): 1,
        ('come', 'salmón'): 1,
        ('salmón', '.'): 1,
    }
    for gram, expected_count in expected.items():
        self.assertEqual(model.count(gram), expected_count, gram)
def test_cond_prob_1gram_no_addone(self):
    """Without add-one smoothing, the unigram model behaves like plain MLE."""
    model = InterpolatedNGram(1, self.sents, gamma=1.0, addone=False)
    cases = [
        ('pescado', 1 / 12.0),
        ('come', 2 / 12.0),
        ('salame', 0.0),  # unseen token
    ]
    for token, expected in cases:
        self.assertAlmostEqual(model.cond_prob(token), expected, msg=token)
def test_cond_prob_2gram_no_addone(self):
    """Bigram prob is a lambda-weighted mix of bigram MLE and unigram MLE."""
    gamma = 1.0
    model = InterpolatedNGram(2, self.sents, gamma, addone=False)
    prev_count = 2.0  # count for 'come' (and '.')
    lambda1 = prev_count / (prev_count + gamma)
    expected = {
        ('pescado', 'come'): lambda1 * 0.5 + (1.0 - lambda1) * 1 / 12.0,
        ('salmón', 'come'): lambda1 * 0.5 + (1.0 - lambda1) * 1 / 12.0,
        ('salame', 'come'): 0.0,
        ('</s>', '.'): lambda1 * 1.0 + (1.0 - lambda1) * 2 / 12.0,
    }
    for (token, prev), prob in expected.items():
        self.assertAlmostEqual(model.cond_prob(token, [prev]), prob, msg=token)
def test_held_out(self):
    """With no explicit gamma, counts come only from the first sentence.

    The second sentence is reserved as held-out data for estimating gamma.
    """
    model = InterpolatedNGram(1, self.sents)
    cases = [
        ((), 6),
        (('el',), 1),
        (('gato',), 1),
        (('come',), 1),
        (('pescado',), 1),
        (('.',), 1),
        (('</s>',), 1),
    ]
    for gram, expected in cases:
        self.assertEqual(model.count(gram), expected, gram)
def test_count_1gram(self):
    """Unigram counts of a gamma-interpolated model over both sentences."""
    model = InterpolatedNGram(1, self.sents, gamma=1.0)
    expected = {
        (): 12,
        ('el',): 1,
        ('gato',): 1,
        ('come',): 2,
        ('pescado',): 1,
        ('.',): 2,
        ('</s>',): 2,
        ('la',): 1,
        ('gata',): 1,
        ('salmón',): 1,
    }
    for gram, expected_count in expected.items():
        self.assertEqual(model.count(gram), expected_count, gram)
def test_norm_1gram(self):
    """Unigram conditional probabilities must sum to at most 1.

    Checked across several gammas, with and without add-one smoothing.
    """
    gammas = (1.0, 5.0, 10.0, 50.0, 100.0)
    vocab = {
        'el', 'gato', 'come', 'pescado', '.',
        'la', 'gata', 'salmón', '</s>',
    }
    for addone in (False, True):
        for gamma in gammas:
            model = InterpolatedNGram(1, self.sents, gamma=gamma, addone=addone)
            prob_sum = sum(model.cond_prob(token) for token in vocab)
            # prob_sum < 1.0 or almost equal to 1.0:
            self.assertAlmostLessEqual(prob_sum, 1.0)
# Build the tokenized corpus from the plain-text file.
tokenizer = RegexpTokenizer(pattern)
root = '.'
corpus = PlaintextCorpusReader(root, 'books\.txt', word_tokenizer=tokenizer)
sents = corpus.sents()

# Train the model selected by the -m option.
n = int(opts['-n'])
model_type = opts['-m']
if model_type == 'addone':
    model = AddOneNGram(n, sents)
elif model_type == 'inter':
    # opts['-g'] may be None, in which case the model estimates gamma itself;
    # passing it straight through covers both the None and the explicit case.
    model = InterpolatedNGram(n, sents, opts['-g'], False)
elif model_type == 'interaddone':
    model = InterpolatedNGram(n, sents, opts['-g'], True)
else:
    model = NGram(n, sents)

# Save the trained model. The original opened the file and never closed it
# (resource leak); 'with' guarantees the handle is closed even on error.
filename = opts['-o']
with open(filename, 'wb') as f:
    pickle.dump(model, f)
word_tokenizer=tokenizer, sent_tokenizer=sent_tokenizer)
# NOTE(review): the line above closes a constructor call (presumably
# PlaintextCorpusReader) that starts outside this fragment — confirm.

# sents is the list of tokenized sentences of the corpus
sents = corpus.sents()

# Train the model selected by the -m option.
type_model = opts['-m']
n = int(opts['-n'])
if type_model == 'ngram':
    model = NGram(n, sents)
    print(str(n) + '-gram will be ready')
elif type_model == 'addone':
    model = AddOneNGram(n, sents)
    print(str(n) + '-addone will be ready')
elif type_model == 'interpolated':
    model = InterpolatedNGram(n, sents)
    print(str(n) + '-interpolated will be ready')
elif type_model == 'backoff':
    model = BackOffNGram(n, sents)
    print(str(n) + '-backoff will be ready')
else:
    # Unknown model type: report and stop ("modelo erroneo" = wrong model).
    print('modelo erroneo')
    exit(0)

# Serialize the trained model to the file given by -o.
filename = opts['-o']
f = open(filename, 'wb')
# pickle.dump writes the object as bytes; load it back with pickle.load(file)
pickle.dump(model, f)
f.close()
# Order of the model (the n in n-gram).
n = int(opts['-n'])
# Model type selector.
m = str(opts['-m'])
# Output path for the pickled model.
filename = opts['-o']

# Train the model selected by -m.
if m == "ngram":
    print("NGram Model selected")
    model = NGram(n, sents)
elif m == "addone":
    print("AddOne NGram Model selected")
    model = AddOneNGram(n, sents)
elif m == "interpolated":
    print("Interpolated NGram Model selected")
    model = InterpolatedNGram(n, sents, addone=True)
elif m == "backoff":
    print("BackOff NGram Model selected")
    model = BackOffNGram(n, sents, addone=True)
else:
    print("Bad Model Type")
    print(help())
    exit()

print("n: %d\nOutput file: %s\n" % (n, filename))

# Save the model. 'with' closes the file even if pickling raises,
# unlike the original open()/close() pair.
with open(filename, 'wb') as f:
    pickle.dump(model, f)