    def test_count_2gram(self):
        model = AddOneNGram(2, self.sents)

        counts = {
            ('<s>',): 2,
            ('el',): 1,
            ('gato',): 1,
            ('come',): 2,
            ('pescado',): 1,
            ('.',): 2,
            ('la',): 1,
            ('gata',): 1,
            ('salmón',): 1,
            ('<s>', 'el'): 1,
            ('el', 'gato'): 1,
            ('gato', 'come'): 1,
            ('come', 'pescado'): 1,
            ('pescado', '.'): 1,
            ('.', '</s>'): 2,
            ('<s>', 'la'): 1,
            ('la', 'gata'): 1,
            ('gata', 'come'): 1,
            ('come', 'salmón'): 1,
            ('salmón', '.'): 1,
        }
        for gram, c in counts.items():
            self.assertEqual(model.count(gram), c, gram)

        # size of the vocabulary
        self.assertEqual(model.V(), 9)
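
Taken together, the unigram and bigram counts above pin down the training corpus: two tokenized sentences, 'el gato come pescado .' and 'la gata come salmón .'. The fixture itself is not shown in these examples; below is a minimal sketch of a setUp consistent with those counts (sents, sents2 and total are the attribute names the tests use; the class name and the exact contents of sents2 are assumptions):

import unittest


class TestAddOneNGram(unittest.TestCase):

    def setUp(self):
        # Two tokenized sentences; they yield the counts checked above and a
        # vocabulary of size 9 (8 word types plus the '</s>' end marker).
        self.sents = [
            'el gato come pescado .'.split(),
            'la gata come salmón .'.split(),
        ]
        # A corpus with one word type fewer (vocabulary size 8), as used by
        # test_count_3_addone_ngram; its exact contents are a guess.
        self.sents2 = [
            'el gato come pescado .'.split(),
            'la gata come pescado .'.split(),
        ]
        # 10 word tokens plus 2 '</s>' markers, as used by test_cond_prob_1gram.
        self.total = 12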
Example #3
    def test_norm_1gram(self):
        model = AddOneNGram(1, self.sents)

        tokens = {'el', 'gato', 'come', 'pescado', '.', 'la', 'gata', 'salmón', '</s>'}

        prob_sum = sum(model.cond_prob(token) for token in tokens)
        # prob_sum < 1.0 or almost equal to 1.0:
        self.assertTrue(prob_sum < 1.0 or abs(prob_sum - 1.0) < 1e-10)
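
The set above is the model's full 9-item vocabulary, so under add-one smoothing the unigram probabilities, presumably (count(w) + 1) / (N + V) with N = 12 tokens and V = 9, sum to (12 + 9) / (12 + 9) = 1; only floating-point error keeps the sum from being exactly 1.0. A hand check under those assumptions:

counts = {'el': 1, 'gato': 1, 'come': 2, 'pescado': 1, '.': 2,
          'la': 1, 'gata': 1, 'salmón': 1, '</s>': 2}
N, V = sum(counts.values()), len(counts)            # 12 and 9
prob_sum = sum((c + 1) / (N + V) for c in counts.values())
assert abs(prob_sum - 1.0) < 1e-10                  # (12 + 9) / (12 + 9) = 1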
Example #4
    def test_count_3_addone_ngram(self):
        # self.sents2 has a vocabulary of size 8 and self.sents of size 9,
        # hence the enumerate index in the expected value of V().
        for i, sents_chosen in enumerate([self.sents2, self.sents]):
            model = AddOneNGram(3, sents_chosen)
            self.assertEqual(model.V(), 8 + i)
Example #5
    def test_norm_2gram(self):
        model = AddOneNGram(2, self.sents)

        tokens = {'el', 'gato', 'come', 'pescado', '.', 'la', 'gata', 'salmón', '</s>'}

        for prev in list(tokens) + ['<s>']:
            prob_sum = sum(model.cond_prob(token, [prev]) for token in tokens)
            # prob_sum < 1.0 or almost equal to 1.0:
            self.assertTrue(prob_sum < 1.0 or abs(prob_sum - 1.0) < 1e-10)
Example #6
    def test_cond_prob_4_addone_gram(self):
        model = AddOneNGram(4, self.sents)

        probs = {
            ('pescado', ('el', 'gato', 'come')): (1.0 + 1.0) / (1.0 + 9.0),
            ('salmón', ('la', 'gata', 'come')): (1.0 + 1.0) / (1.0 + 9.0),
            ('salame', ('el', 'gato', 'come')): 1.0 / (1.0 + 9.0),
        }
        for (token, prev), p in probs.items():
            self.assertAlmostEqual(model.cond_prob(token, list(prev)), p)
Example #7
    def test_cond_prob_2gram(self):
        model = AddOneNGram(2, self.sents)

        probs = {
            ('pescado', 'come'): (1.0 + 1.0) / (2.0 + 9.0),
            ('salmón', 'come'): (1.0 + 1.0) / (2.0 + 9.0),
            ('salame', 'come'): 1.0 / (2.0 + 9.0),
        }
        for (token, prev), p in probs.items():
            self.assertAlmostEqual(model.cond_prob(token, (prev, )), p)
Example #8
    def test_cond_prob_1gram(self):
        model = AddOneNGram(1, self.sents)

        probs = {
            'pescado': (1.0 + 1.0) / (self.total + 9.0),
            'come': (2.0 + 1.0) / (self.total + 9.0),
            'salame': 1.0 / (self.total + 9.0),
        }
        for token, p in probs.items():
            self.assertAlmostEqual(model.cond_prob(token), p)
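
All of the expected values in these cond_prob tests follow the same pattern: add one to the observed n-gram count and add the vocabulary size to the context count. A minimal sketch of that computation, written as a free function over a model exposing the count and V methods exercised above (the function name is hypothetical, and this is not necessarily how AddOneNGram.cond_prob is implemented):

def addone_cond_prob(model, token, prev_tokens=None):
    # Add-one (Laplace) estimate, matching the expected values above:
    #   P(token | prev) = (count(prev + (token,)) + 1) / (count(prev) + V)
    # e.g. P('pescado' | 'come') = (1 + 1) / (2 + 9) in test_cond_prob_2gram;
    # the unigram case uses count(()) = 12, the total number of tokens.
    prev = tuple(prev_tokens) if prev_tokens else ()
    return (model.count(prev + (token,)) + 1.0) / (model.count(prev) + model.V())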
Example #9
    def test_cond_prob_2gram(self):
        model = AddOneNGram(2, self.sents)

        probs = {
            ('pescado', 'come'): (1.0 + 1.0) / (2.0 + 9.0),
            ('salmón', 'come'): (1.0 + 1.0) / (2.0 + 9.0),
            ('salame', 'come'): 1.0 / (2.0 + 9.0),
        }
        for (token, prev), p in probs.items():
            self.assertEqual(model.cond_prob(token, [prev]), p)
Example #10
    def test_cond_prob_1gram(self):
        model = AddOneNGram(1, self.sents)

        probs = {
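            # denominators: 12 tokens in the training corpus (10 words plus
            # two '</s>' markers) plus the vocabulary size 9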
            'pescado': (1.0 + 1.0) / (12.0 + 9.0),
            'come': (2.0 + 1.0) / (12.0 + 9.0),
            'salame': 1.0 / (12.0 + 9.0),
        }
        for token, p in probs.items():
            self.assertEqual(model.cond_prob(token), p)
Example #11
    def test_norm_3gram(self):
        model = AddOneNGram(3, self.sents)

        tokens = {'el', 'gato', 'come', 'pescado', '.', 'la', 'gata', 'salmón', '</s>'}
        prevs = [['<s>', '<s>']] + \
            [['<s>', t] for t in tokens] + \
            [[t1, t2] for t1 in tokens for t2 in tokens]

        for prev in prevs:
            prob_sum = sum(model.cond_prob(token, prev) for token in tokens)
            # prob_sum < 1.0 or almost equal to 1.0:
            self.assertTrue(prob_sum < 1.0 or abs(prob_sum - 1.0) < 1e-10)
Example #12
    def test_get_most_probable_token_2gram(self):
        model = AddOneNGram(2, self.sents)
        ss = SentSorter(model)
        prev_tokens = [('<s>', ), ('el', ), ('la', ), ('gato', ), ('gata', )]

        possible_tokens = [['gato', 'come', 'el'], ['gata', 'gato'],
                           ['gato', 'salta', 'gata'],
                           ['salta', 'gata', 'come'], ['la', 'el', 'salta']]

        for p_tokens, prev in zip(possible_tokens, prev_tokens):
            self.assertEqual(ss.get_most_probable_token(p_tokens, prev),
                             p_tokens[-1])
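
SentSorter itself is not shown in these examples. Given the test above, get_most_probable_token presumably returns, among the candidate tokens, the one the model considers most likely to follow the given context; a sketch under that assumption (not the class under test):

class SentSorter:
    # sketch only; the actual implementation is not part of these examples

    def __init__(self, model):
        self.model = model

    def get_most_probable_token(self, possible_tokens, prev_tokens):
        # Pick the candidate the model assigns the highest conditional
        # probability after prev_tokens (assumed behaviour, inferred from
        # the test above).
        return max(possible_tokens,
                   key=lambda tok: self.model.cond_prob(tok, list(prev_tokens)))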
Example #13
    def test_sort_probable_sents_one_training_sent(self):
        sents = self.sents[:1]
        for i in range(2, 4):
            model = AddOneNGram(i, sents)
            unordered_sents = list(map(str.split, [
                'el come gato', 'el gato come', 'gato come el', 'gato el come',
                'come el gato', 'come gato el'
            ]))
            ss = SentSorter(model)
            sorted_sents = ss.sort_probable_sents(unordered_sents)
            for sent in sorted_sents:
                self.assertEqual(sent, sents[0])
Example #14
        | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
        | \.\.\.            # ellipsis
        | [][.,;"'?():\-_`]  # these are separate tokens; includes ], [
                             # (hyphen escaped so ':-_' is not read as a range)
    '''
    tokenizer = RegexpTokenizer(pattern)

    root = '.'
    corpus = PlaintextCorpusReader(root, r'books\.txt', word_tokenizer=tokenizer)

    sents = corpus.sents()

    # train the model
    n = int(opts['-n'])

    if opts['-m'] == 'addone':
        model = AddOneNGram(n, sents)
    elif opts['-m'] == 'inter':
        gamma = opts['-g']
        if gamma is not None:
            gamma = float(gamma)  # command-line option values arrive as strings
        model = InterpolatedNGram(n, sents, gamma, False)
    elif opts['-m'] == 'interaddone':
        gamma = opts['-g']
        if gamma is not None:
            gamma = float(gamma)
        model = InterpolatedNGram(n, sents, gamma, True)
    else:
        model = NGram(n, sents)
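
The option handling reads like a docopt-style dictionary whose values are strings, which is why -n goes through int() and -g needs a float() conversion before reaching InterpolatedNGram. For illustration, options that would train a bigram add-one model over books.txt might look like this (the dictionary shape is an assumption; only the -n, -m and -g keys appear in the fragment):

# hypothetical options, mimicking what docopt would produce
opts = {'-n': '2', '-m': 'addone', '-g': None}      # bigram, add-one smoothing
# opts = {'-n': '3', '-m': 'inter', '-g': '1.0'}    # trigram, interpolated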