Exemple #1
0
    def test_count_3gram(self):
        """Check bigram and trigram counts of a 3-gram model.

        The corpus is built locally and includes a one-word sentence,
        which is expected to yield the trigram ('<s>', 'unaria', '</s>').
        """
        sents = [
            'el gato come pescado .'.split(),
            'la gata come salmón .'.split(),
            'unaria'.split(),
        ]
        ngram = NGram(3, sents)

        counts = {
            ('<s>', 'el'): 1,
            ('el', 'gato'): 1,
            ('gato', 'come'): 1,
            ('come', 'pescado'): 1,
            ('pescado', '.'): 1,

            ('<s>', 'la'): 1,
            ('la', 'gata'): 1,
            ('gata', 'come'): 1,
            ('come', 'salmón'): 1,
            ('salmón', '.'): 1,
            ('<s>', 'unaria'): 1,

            ('<s>', 'el', 'gato'): 1,
            ('el', 'gato', 'come'): 1,
            ('gato', 'come', 'pescado'): 1,
            ('come', 'pescado', '.'): 1,
            ('<s>', 'la', 'gata'): 1,
            ('la', 'gata', 'come'): 1,
            ('gata', 'come', 'salmón'): 1,
            ('come', 'salmón', '.'): 1,
            ('salmón', '.', '</s>'): 1,
            ('<s>', 'unaria', '</s>'): 1,
        }
        for gram, c in counts.items():
            # Name the gram in the failure message: a bare "0 != 1" does not
            # say which of the entries above is wrong.
            self.assertEqual(ngram.count(gram), c, msg=repr(gram))
Exemple #2
0
    def test_count_3gram(self):
        """Check bigram and trigram counts of a 3-gram model over self.sents.

        The expected table includes the double start marker ('<s>', '<s>'),
        counted once per sentence.
        """
        ngram = NGram(3, self.sents)
        counts = {
            ('<s>', 'el'): 1,
            ('el', 'gato'): 1,
            ('gato', 'come'): 1,
            ('come', 'pescado'): 1,
            ('pescado', '.'): 1,
            ('<s>', '<s>'): 2,
            ('<s>', 'la'): 1,
            ('la', 'gata'): 1,
            ('gata', 'come'): 1,
            ('come', 'salmón'): 1,
            ('salmón', '.'): 1,
            ('<s>', '<s>', 'el'): 1,
            ('<s>', 'el', 'gato'): 1,
            ('el', 'gato', 'come'): 1,
            ('gato', 'come', 'pescado'): 1,
            ('come', 'pescado', '.'): 1,
            ('pescado', '.', '</s>'): 1,
            ('<s>', '<s>', 'la'): 1,
            ('<s>', 'la', 'gata'): 1,
            ('la', 'gata', 'come'): 1,
            ('gata', 'come', 'salmón'): 1,
            ('come', 'salmón', '.'): 1,
            ('salmón', '.', '</s>'): 1,
        }

        for gram, c in counts.items():
            # Identify the failing gram instead of reporting only two numbers.
            self.assertEqual(ngram.count(gram), c, msg=repr(gram))
Exemple #3
0
    def test_generate_sent_3and4gram(self):
        """Sentences sampled from 3- and 4-gram models belong to a known set.

        Both models are trained on self.sents4; 1000 sentences are drawn
        from each generator and checked against the list below.
        """
        ngram = NGram(3, self.sents4)
        ngram2 = NGram(4, self.sents4)
        generator = NGramGenerator(ngram)
        generator2 = NGramGenerator(ngram2)

        # all the possible generated sentences for 3 or 4-grams:
        sents = [
            'la casa se construye y el corre y la gata come ensalada',
            'el corre y la gata come pescado y duerme',
            'la casa se construye y el corre y la gata come ensalada',
            'la casa se construye y el corre y la gata come pescado y duerme',
            'la casa se construye y el corre',
            'la gata come pescado y duerme',
            'el corre y la gata come ensalada',
            'el corre',
            'la gata come ensalada',
            'la casa se construye y el corre',
            'la gata come pescado y duerme',
        ]

        for _ in range(1000):
            sent = generator.generate_sent()
            sent2 = generator2.generate_sent()
            # assertIn reports the unexpected sentence on failure, whereas
            # assertTrue(... in ...) only says "False is not true".
            self.assertIn(' '.join(sent), sents)
            self.assertIn(' '.join(sent2), sents)
Exemple #4
0
    def test_count_2gram(self):
        """Check unigram and bigram counts of a 2-gram model over self.sents."""
        ngram = NGram(2, self.sents)

        counts = {
            ('<s>',): 2,
            ('el',): 1,
            ('gato',): 1,
            ('come',): 2,
            ('pescado',): 1,
            ('.',): 2,
            ('la',): 1,
            ('gata',): 1,
            ('salmón',): 1,
            ('<s>', 'el'): 1,
            ('el', 'gato'): 1,
            ('gato', 'come'): 1,
            ('come', 'pescado'): 1,
            ('pescado', '.'): 1,
            ('.', '</s>'): 2,
            ('<s>', 'la'): 1,
            ('la', 'gata'): 1,
            ('gata', 'come'): 1,
            ('come', 'salmón'): 1,
            ('salmón', '.'): 1,
        }
        for gram, c in counts.items():
            # Identify the failing gram in the assertion message.
            self.assertEqual(ngram.count(gram), c, msg=repr(gram))
Exemple #5
0
    def test_cond_prob_2gram(self):
        """Check P(token | prev) of a 2-gram model for seen and unseen tokens."""
        ngram = NGram(2, self.sents)

        # Keys are (token, previous token).
        probs = {
            ('pescado', 'come'): 0.5,
            ('salmón', 'come'): 0.5,
            ('salame', 'come'): 0.0,
        }
        for (token, prev), p in probs.items():
            # Report which (token, prev) pair failed.
            self.assertAlmostEqual(ngram.cond_prob(token, (prev,)), p,
                                   msg=repr((token, prev)))
Exemple #6
0
    def test_cond_prob_1gram(self):
        """Check unigram probabilities, including an unseen token."""
        ngram = NGram(1, self.sents)

        probs = {
            'pescado': 1 / 12.0,
            'come': 2 / 12.0,
            'salame': 0.0,
        }
        for token, p in probs.items():
            # 1/12 is not exactly representable in binary floating point, so
            # compare with a tolerance rather than with exact equality.
            self.assertAlmostEqual(ngram.cond_prob(token), p, msg=token)
Exemple #7
0
    def test_cond_prob_1gram(self):
        """Check unigram probabilities, including an unseen token."""
        ngram = NGram(1, self.sents)

        probs = {
            'pescado': 1 / 12.0,
            'come': 2 / 12.0,
            'salame': 0.0,
        }
        for token, p in probs.items():
            # Name the token so a failure points at the offending entry.
            self.assertAlmostEqual(ngram.cond_prob(token), p, msg=token)
Exemple #8
0
    def test_cond_prob_2gram(self):
        """Check P(token | prev) of a 2-gram model for seen and unseen tokens."""
        ngram = NGram(2, self.sents)

        # Keys are (token, previous token).
        probs = {
            ('pescado', 'come'): 0.5,
            ('salmón', 'come'): 0.5,
            ('salame', 'come'): 0.0,
        }
        for (token, prev), p in probs.items():
            # Probabilities are floats: compare with a tolerance and report
            # which (token, prev) pair failed.
            self.assertAlmostEqual(ngram.cond_prob(token, [prev]), p,
                                   msg=repr((token, prev)))
Exemple #9
0
    def test_sent_prob_1gram(self):
        """Unigram sentence probabilities equal the product of token probs."""
        model = NGram(1, self.sents)

        # 'come', '.' and '</s>' have prob 1/6, the rest have 1/12.
        sixth = 1 / 6.0
        twelfth = 1 / 12.0
        expected = {
            'el gato come pescado .': sixth**3 * twelfth**3,
            'la gata come salmón .': sixth**3 * twelfth**3,
            # 'salame' never occurs in the training data.
            'el gato come salame .': 0.0,
            'la la la': sixth**1 * twelfth**3,
        }
        for text, p in expected.items():
            tokens = text.split()
            self.assertAlmostEqual(model.sent_prob(tokens), p, msg=text)
Exemple #10
0
    def test_sent_prob_1gram(self):
        """Compare unigram sentence probabilities with hand-computed values."""
        unigram = NGram(1, self.sents)

        # 'come', '.' and '</s>' have prob 1/6, the rest have 1/12.
        p_frequent = 1 / 6.0
        p_rare = 1 / 12.0
        five_token_sentence = p_frequent**3 * p_rare**3
        cases = {
            'el gato come pescado .': five_token_sentence,
            'la gata come salmón .': five_token_sentence,
            'el gato come salame .': 0.0,  # 'salame' unseen
            'la la la': p_frequent**1 * p_rare**3,
        }
        for sentence, want in cases.items():
            got = unigram.sent_prob(sentence.split())
            self.assertAlmostEqual(got, want, msg=sentence)
Exemple #11
0
    def test_sent_log_prob_1gram(self):
        """Unigram base-2 log-probabilities equal the sum of token log-probs."""
        model = NGram(1, self.sents)

        # 'come', '.' and '</s>' have prob 1/6, the rest have 1/12.
        log_sixth = log(1 / 6.0, 2)
        log_twelfth = log(1 / 12.0, 2)
        expected = {
            'el gato come pescado .': 3 * log_sixth + 3 * log_twelfth,
            'la gata come salmón .': 3 * log_sixth + 3 * log_twelfth,
            # 'salame' never occurs in the training data.
            'el gato come salame .': -inf,
            'la la la': log_sixth + 3 * log_twelfth,
        }
        for text, lp in expected.items():
            tokens = text.split()
            self.assertAlmostEqual(model.sent_log_prob(tokens), lp, msg=text)
Exemple #12
0
    def test_cond_prob_3gram(self):
        """Check P(token | two previous tokens) of a 3-gram model."""
        ngram = NGram(3, self.sents)

        # Keys are (token, (two previous tokens)).
        probs = {
            ('pescado', ('gato', 'come')): 1,
            ('salmón', ('gata', 'come')): 1,
            ('salame', ('gato', 'come')): 0.0,
            ('gato', ('<s>', 'el')): 1,
            ('gata', ('<s>', 'el')): 0.0,
        }

        for (token, prev), p in probs.items():
            # Probabilities are floats: compare with a tolerance and report
            # which (token, context) entry failed.
            self.assertAlmostEqual(ngram.cond_prob(token, list(prev)), p,
                                   msg=repr((token, prev)))
    def test_sent_prob_3gram(self):
        """Compare 3-gram sentence probabilities with hand-computed values."""
        model = NGram(3, self.sents)

        # after '<s>, <s>': 'el' and 'la' have prob 0.5.
        p_training_sentence = 0.5 * 1 * 1 * 1 * 1
        # prob('gato come salmon') = 0
        p_mixed_sentence = 0.5 * 1 * 1 * 0 * 1 * 1
        expected = {
            'el gato come pescado .': p_training_sentence,
            'la gata come salmón .': p_training_sentence,
            'el gato come salmón .': p_mixed_sentence,
            # 'salame' never occurs in the training data.
            'el gato come salame .': 0.0,
            # 'la' is never followed by 'la'.
            'la la la': 0.0,
        }
        for text, p in expected.items():
            tokens = text.split()
            self.assertAlmostEqual(model.sent_prob(tokens), p, msg=text)
    def test_cond_prob_3gram(self):
        """Check P(token | prev1, prev2) of a 3-gram model over self.sents."""
        ngram = NGram(3, self.sents)

        # Keys are (token, prev1, prev2).
        probs = {
            ('pescado', 'gato', 'come'): 1,
            ('salmón', 'gata', 'come'): 1,
            ('salame', 'gato', 'come'): 0.0,
            ('salmón', 'gato', 'come'): 0.0,
            ('el', '<s>', '<s>'): 0.5,
            ('</s>', 'pescado', '.'): 1,
        }
        for (token, prev1, prev2), p in probs.items():
            # Probabilities are floats: compare with a tolerance and report
            # which entry failed.
            self.assertAlmostEqual(ngram.cond_prob(token, [prev1, prev2]), p,
                                   msg=repr((token, prev1, prev2)))
    def test_cond_prob_3gram(self):
        """Check P(token | prev1, prev2) of a 3-gram model over self.sents."""
        ngram = NGram(3, self.sents)

        # Keys are (token, prev1, prev2).
        probs = {
            ('pescado', 'gato', 'come'): 1,
            ('salmón', 'gata', 'come'): 1,
            ('salame', 'gato', 'come'): 0.0,
            ('salmón', 'gato', 'come'): 0.0,
            ('el', '<s>', '<s>'): 0.5,
            ('</s>', 'pescado', '.'): 1,
        }
        for (token, prev1, prev2), p in probs.items():
            # Tolerance-based comparison for float probabilities; the message
            # names the failing entry.
            self.assertAlmostEqual(ngram.cond_prob(token, [prev1, prev2]), p,
                                   msg=repr((token, prev1, prev2)))
Exemple #16
0
    def test_sent_log_prob_1gram(self):
        """Compare unigram base-2 log-probabilities with hand-computed sums."""
        unigram = NGram(1, self.sents)

        def log2(x):
            """Return the base-2 logarithm of x."""
            return log(x, 2)

        # 'come', '.' and '</s>' have prob 1/6, the rest have 1/12.
        five_token_sentence = 3 * log2(1 / 6.0) + 3 * log2(1 / 12.0)
        cases = {
            'el gato come pescado .': five_token_sentence,
            'la gata come salmón .': five_token_sentence,
            'el gato come salame .': float('-inf'),  # 'salame' unseen
            'la la la': log2(1 / 6.0) + 3 * log2(1 / 12.0),
        }
        for sentence, want in cases.items():
            got = unigram.sent_log_prob(sentence.split())
            self.assertAlmostEqual(got, want, msg=sentence)
Exemple #17
0
    def test_cond_prob_4gram(self):
        """Check P(token | three previous tokens) of a 4-gram over self.sents2."""
        ngram = NGram(4, self.sents2)

        # Keys are (token, (three previous tokens)).
        probs = {
            ('pescado', ('el', 'gato', 'come')): 1,
            ('salmón', ('la', 'gata', 'come')): 0.0,
            ('salame', ('el', 'gato', 'come')): 0.0,
            ('gato', ('<s>', '<s>', 'el')): 1,
            ('viejo', ('gato', 'come', 'pescado')): 0.5,
            ('fresco', ('gato', 'come', 'pescado')): 0.5,
        }

        for (token, prev), p in probs.items():
            # Probabilities are floats: compare with a tolerance and report
            # which (token, context) entry failed.
            self.assertAlmostEqual(ngram.cond_prob(token, list(prev)), p,
                                   msg=repr((token, prev)))
Exemple #18
0
    def test_sent_prob_2gram(self):
        """Bigram sentence probabilities match the hand-computed values."""
        model = NGram(2, self.sents)

        # after '<s>': 'el' and 'la' have prob 0.5.
        # after 'come': 'pescado' and 'salmón' have prob 0.5.
        half = 0.5
        expected = {
            'el gato come pescado .': half * half,
            'la gata come salmón .': half * half,
            'el gato come salmón .': half * half,
            'la gata come pescado .': half * half,
            # 'salame' never occurs in the training data.
            'el gato come salame .': 0.0,
            # 'la' is never followed by 'la'.
            'la la la': 0.0,
        }
        for text, p in expected.items():
            tokens = text.split()
            self.assertAlmostEqual(model.sent_prob(tokens), p, msg=text)
Exemple #19
0
    def test_sent_prob_2gram(self):
        """Compare bigram sentence probabilities with hand-computed values."""
        bigram = NGram(2, self.sents)

        # after '<s>': 'el' and 'la' have prob 0.5.
        # after 'come': 'pescado' and 'salmón' have prob 0.5.
        two_choices = 0.5 * 0.5
        cases = {
            'el gato come pescado .': two_choices,
            'la gata come salmón .': two_choices,
            'el gato come salmón .': two_choices,
            'la gata come pescado .': two_choices,
            'el gato come salame .': 0.0,  # 'salame' unseen
            'la la la': 0.0,  # 'la' after 'la' unseen
        }
        for sentence, want in cases.items():
            got = bigram.sent_prob(sentence.split())
            self.assertAlmostEqual(got, want, msg=sentence)
Exemple #20
0
    def test_sent_log_prob_2gram(self):
        """Compare bigram base-2 log-probabilities with hand-computed values."""
        bigram = NGram(2, self.sents)

        def log2(x):
            """Return the base-2 logarithm of x."""
            return log(x, 2)

        # after '<s>': 'el' and 'la' have prob 0.5.
        # after 'come': 'pescado' and 'salmón' have prob 0.5.
        two_choices = 2 * log2(0.5)
        impossible = float('-inf')
        cases = {
            'el gato come pescado .': two_choices,
            'la gata come salmón .': two_choices,
            'el gato come salmón .': two_choices,
            'la gata come pescado .': two_choices,
            'el gato come salame .': impossible,  # 'salame' unseen
            'la la la': impossible,  # 'la' after 'la' unseen
        }
        for sentence, want in cases.items():
            got = bigram.sent_log_prob(sentence.split())
            self.assertAlmostEqual(got, want, msg=sentence)
    def test_sent_prob_3gram(self):
        """Trigram sentence probabilities match the hand-computed values."""
        trigram = NGram(3, self.sents)

        # after '<s>, <s>': 'el' and 'la' have prob 0.5.
        seen = 0.5 * 1 * 1 * 1 * 1
        # prob('gato come salmon') = 0
        mixed = 0.5 * 1 * 1 * 0 * 1 * 1
        cases = {
            'el gato come pescado .': seen,
            'la gata come salmón .': seen,
            'el gato come salmón .': mixed,
            'el gato come salame .': 0.0,  # 'salame' unseen
            'la la la': 0.0,  # 'la' after 'la' unseen
        }
        for sentence, want in cases.items():
            got = trigram.sent_prob(sentence.split())
            self.assertAlmostEqual(got, want, msg=sentence)
Exemple #22
0
    def test_init_3gram(self):
        """A generator built on a 3-gram model exposes the expected tables.

        `probs` maps each two-token context to a {token: probability} dict;
        `sorted_probs` maps the same contexts to lists of (token, probability)
        pairs.
        """
        model = NGram(3, self.sents)
        gen = NGramGenerator(model)

        expected_probs = {
            ('<s>', '<s>'): {'el': 0.5, 'la': 0.5},
            # first training sentence
            ('<s>', 'el'): {'gato': 1.0},
            ('el', 'gato'): {'come': 1.0},
            ('gato', 'come'): {'pescado': 1.0},
            ('come', 'pescado'): {'.': 1.0},
            ('pescado', '.'): {'</s>': 1.0},
            # second training sentence
            ('<s>', 'la'): {'gata': 1.0},
            ('la', 'gata'): {'come': 1.0},
            ('gata', 'come'): {'salmón': 1.0},
            ('come', 'salmón'): {'.': 1.0},
            ('salmón', '.'): {'</s>': 1.0},
        }
        expected_sorted = {
            ('<s>', '<s>'): [('el', 0.5), ('la', 0.5)],
            # first training sentence
            ('<s>', 'el'): [('gato', 1.0)],
            ('el', 'gato'): [('come', 1.0)],
            ('gato', 'come'): [('pescado', 1.0)],
            ('come', 'pescado'): [('.', 1.0)],
            ('pescado', '.'): [('</s>', 1.0)],
            # second training sentence
            ('<s>', 'la'): [('gata', 1.0)],
            ('la', 'gata'): [('come', 1.0)],
            ('gata', 'come'): [('salmón', 1.0)],
            ('come', 'salmón'): [('.', 1.0)],
            ('salmón', '.'): [('</s>', 1.0)],
        }

        actual_probs = dict(gen.probs)
        self.assertEqual(actual_probs, expected_probs)
        self.assertEqual(gen.sorted_probs, expected_sorted)
Exemple #23
0
    def test_sent_log_prob_2gram(self):
        """Bigram base-2 log-probabilities match the hand-computed values."""
        model = NGram(2, self.sents)

        # after '<s>': 'el' and 'la' have prob 0.5.
        # after 'come': 'pescado' and 'salmón' have prob 0.5.
        log_half = log(0.5, 2)
        expected = {
            'el gato come pescado .': 2 * log_half,
            'la gata come salmón .': 2 * log_half,
            'el gato come salmón .': 2 * log_half,
            'la gata come pescado .': 2 * log_half,
            # 'salame' never occurs in the training data.
            'el gato come salame .': -inf,
            # 'la' is never followed by 'la'.
            'la la la': -inf,
        }
        for text, lp in expected.items():
            tokens = text.split()
            self.assertAlmostEqual(model.sent_log_prob(tokens), lp, msg=text)
Exemple #24
0
    def test_count_1gram(self):
        """Check unigram counts; the empty tuple holds the total token count."""
        ngram = NGram(1, self.sents)

        counts = {
            (): 12,
            ('el',): 1,
            ('gato',): 1,
            ('come',): 2,
            ('pescado',): 1,
            ('.',): 2,
            ('</s>',): 2,
            ('la',): 1,
            ('gata',): 1,
            ('salmón',): 1,
        }
        for gram, c in counts.items():
            # Identify the failing gram in the assertion message.
            self.assertEqual(ngram.count(gram), c, msg=repr(gram))
    def test_sent_log_prob_3gram(self):
        """Compare trigram base-2 log-probabilities with the expected values."""
        trigram = NGram(3, self.sents)

        def log2(x):
            """Return the base-2 logarithm of x."""
            return log(x, 2)

        # The two training sentences are expected to score log2(0.5); any
        # sentence mixing them or using unseen words is expected to be -inf.
        seen = log2(0.5)
        impossible = float('-inf')
        cases = {
            'el gato come pescado .': seen,
            'la gata come salmón .': seen,
            'el gato come salmón .': impossible,
            'la gata come pescado .': impossible,
            'el gato come salame .': impossible,  # 'salame' unseen
            'la la la': impossible,  # 'la' after 'la' unseen
        }
        for sentence, want in cases.items():
            got = trigram.sent_log_prob(sentence.split())
            self.assertAlmostEqual(got, want, msg=sentence)
    def test_generate_sent_1gram(self):
        """Sentences from a unigram generator only use the training vocabulary."""
        ngram = NGram(1, self.sents)
        generator = NGramGenerator(ngram)

        voc = {'el', 'gato', 'come', 'pescado', '.', 'la', 'gata', 'salmón'}

        for _ in range(100):
            sent = generator.generate_sent()
            # Equivalent to set(sent).issubset(voc), but a failure lists the
            # out-of-vocabulary tokens instead of "False is not true".
            self.assertEqual(set(sent) - voc, set())
Exemple #27
0
    def test_generate_token_3and4gram(self):
        """Tokens sampled after fixed contexts match the self.sents3 corpus."""
        ngram = NGram(3, self.sents3)
        ngram2 = NGram(4, self.sents3)
        generator = NGramGenerator(ngram)
        generator2 = NGramGenerator(ngram2)

        for _ in range(100):
            # after 'come pescado' always comes 'y'
            token = generator.generate_token(('come', 'pescado'))
            self.assertEqual(token, 'y')
            # after 'come pescado y' always comes 'duerme'
            token = generator2.generate_token(('come', 'pescado', 'y'))
            self.assertEqual(token, 'duerme')
            # a sentence may start with 'el' or 'la'; assertIn shows the
            # offending token on failure
            token = generator.generate_token(('<s>', '<s>'))
            self.assertIn(token, ['el', 'la'])
            token = generator2.generate_token(('<s>', '<s>', '<s>'))
            self.assertIn(token, ['el', 'la'])
    def test_generate_token(self):
        """Tokens sampled from a bigram generator respect the training data."""
        ngram = NGram(2, self.sents)
        generator = NGramGenerator(ngram)

        for _ in range(100):
            # after 'el' always comes 'gato':
            token = generator.generate_token(('el',))
            self.assertEqual(token, 'gato')

            # after 'come' may come 'pescado' or 'salmón'; assertIn shows the
            # offending token on failure
            token = generator.generate_token(('come',))
            self.assertIn(token, ['pescado', 'salmón'])
Exemple #29
0
    def test_perplexity_calculation(self):
        """Perplexity of 3- and 4-gram models on their own training data.

        The expected value is 2 ** (-(2 * log2(0.5)) / 10).
        NOTE(review): presumably two sentences of probability 0.5 over ten
        tokens in total -- confirm against self.sents.
        """
        model = NGram(3, self.sents)
        model2 = NGram(4, self.sents)
        perplexity = 2**(-(log2(0.5)*2)/10.0)

        # The value is built from logarithms and powers, so compare with a
        # tolerance: exact float equality depends on the order of operations
        # inside the implementation.
        self.assertAlmostEqual(perplexity, model.perplexity(self.sents))
        self.assertAlmostEqual(perplexity, model2.perplexity(self.sents))
    def test_generate_sent_2gram(self):
        """Sentences sampled from a bigram generator belong to a known set."""
        ngram = NGram(2, self.sents)
        generator = NGramGenerator(ngram)

        # all the possible generated sentences for 2-grams:
        sents = [
            'el gato come pescado .',
            'la gata come salmón .',
            'el gato come salmón .',
            'la gata come pescado .',
        ]

        for _ in range(100):
            sent = generator.generate_sent()
            # assertIn states what was searched for and where; the raw token
            # list is kept as the extra message.
            self.assertIn(' '.join(sent), sents, sent)
    def test_init_1gram(self):
        """A unigram model stores the expected raw counts table.

        The empty tuple maps to the total number of tokens.
        """
        model = NGram(1, self.sents)

        expected = {
            (): 10,
            ('el',): 1, ('gato',): 1,
            ('come',): 2,
            ('pescado',): 1,
            ('.',): 2,
            ('la',): 1, ('gata',): 1,
            ('salmón',): 1,
        }

        actual = dict(model.counts)
        self.assertEqual(actual, expected)
    def test_init_1gram(self):
        """A generator over a unigram model stores one distribution for ()."""
        model = NGram(1, self.sents)
        gen = NGramGenerator(model)

        once = 1 / 12.0
        twice = 2 / 12.0
        distribution = {
            'el': once,
            'gato': once,
            'come': twice,
            'pescado': once,
            '.': twice,
            '</s>': twice,
            'la': once,
            'gata': once,
            'salmón': once,
        }
        expected = {(): distribution}

        self.assertEqual(gen._probs, expected)
Exemple #33
0
    def test_sent_log_prob_3and4gram(self):
        """3- and 4-gram models over self.sents2 agree on log-probabilities."""
        trigram = NGram(3, self.sents2)
        fourgram = NGram(4, self.sents2)

        impossible = float('-inf')
        cases = {
            'el gato come pescado nuevo .': impossible,  # 'nuevo' unseen
            'la la la': impossible,  # 'la' after 'la' unseen
            # after 'pescado': 'viejo' and 'fresco' have prob 0.5.
            'el gato come pescado fresco . ': log2(0.5),
            'el gato come pescado viejo . ': log2(0.5)
        }
        for sentence, want in cases.items():
            # Check the 3-gram model first, then the 4-gram model.
            for model in (trigram, fourgram):
                got = model.sent_log_prob(sentence.split())
                self.assertAlmostEqual(got, want, msg=sentence)
    def test_init_2gram(self):
        """A bigram model stores the expected unigram and bigram counts."""
        model = NGram(2, self.sents)

        expected = {
            # unigrams
            ('el',): 1, ('gato',): 1,
            ('come',): 2,
            ('pescado',): 1,
            # ('.',): 2,
            ('la',): 1, ('gata',): 1,
            ('salmón',): 1,
            # bigrams of the first sentence
            ('el', 'gato'): 1,
            ('gato', 'come'): 1,
            ('come', 'pescado'): 1,
            ('pescado', '.'): 1,
            # bigrams of the second sentence
            ('la', 'gata'): 1,
            ('gata', 'come'): 1,
            ('come', 'salmón'): 1,
            ('salmón', '.'): 1,
        }

        actual = dict(model.counts)
        self.assertEqual(actual, expected)
Exemple #35
0
    def test_sent_prob_3and4gram(self):
        """3- and 4-gram models over self.sents3 agree on sentence probs."""
        trigram = NGram(3, self.sents3)
        fourgram = NGram(4, self.sents3)

        cases = {
            'el gato come pescado y ronca .': 0.0,  # 'ronca' unseen
            'la la la': 0.0,  # 'la' after 'la' unseen
            # the probability is decided by the beginning of the sentence:
            # whether it starts with 'la' or 'el'
            'el gato come pescado y duerme . ': 0.5,
            'la gata come pescado y duerme . ': 0.5
        }

        for sentence, want in cases.items():
            # Check the 3-gram model first, then the 4-gram model.
            for model in (trigram, fourgram):
                got = model.sent_prob(sentence.split())
                self.assertAlmostEqual(got, want, msg=sentence)
    def test_prob_1gram(self):
        """Check unigram probabilities returned by NGram.prob."""
        ngram = NGram(1, self.sents)

        # 0.1 and 0.2 are not exactly representable in binary floating point,
        # so compare with a tolerance rather than with exact equality.
        self.assertAlmostEqual(ngram.prob('pescado'), 0.1)
        self.assertAlmostEqual(ngram.prob('come'), 0.2)
    def test_prob(self):
        """Check bigram probabilities of the tokens that follow 'come'."""
        ngram = NGram(2, self.sents)

        # Probabilities are floats; use a tolerance-based comparison.
        self.assertAlmostEqual(ngram.prob('pescado', ['come']), 0.5)
        self.assertAlmostEqual(ngram.prob('salmón', ['come']), 0.5)
# from languagemodeling.ngram import NGram, AddOneNGram, InterpolatedNGram

# models = {
#     'ngram': NGram,
#     'addone': AddOneNGram,
#     'inter': InterpolatedNGram,
# }

if __name__ == '__main__':
    # Train an n-gram model on a plain-text corpus and pickle it to disk.
    # Options read from the docopt usage string: -n (model order) and
    # -o (output file).
    opts = docopt(__doc__)

    # Load the training corpus.
    # NOTE: 'path' is a placeholder corpus root; replace it with the
    # directory that holds the .txt files (e.g. 'corpus_airbnb/txt').
    # The second argument is a regular expression matched against file
    # names, not a shell glob: '*.txt' is not a valid regex.
    corpus = PlaintextCorpusReader('path', r'.*\.txt')
    sents = list(corpus.sents())

    # train the model
    n = int(opts['-n'])
    model = NGram(n, sents)
    # To pick the model class from the command line instead:
    # model_class = models[opts['-m']]
    # model = model_class(n, sents)

    # save it; the context manager closes the file even if pickling fails
    filename = opts['-o']
    with open(filename, 'wb') as f:
        pickle.dump(model, f)