import unittest
from collections import defaultdict
import numpy as np
from quora_ngram import NgramModel
import pprint

pp = pprint.PrettyPrinter(indent=4)

comments = [[
    'xxxxx', ['where', 'where', 'where', 'are', 'are', 'you', 'you', 'now'], 1
], ['yyyyy', ['who', 'who', 'who', 'are', 'are', 'you', 'you', 'you'], 0]]
ngrams = NgramModel(comments)
#gram_models = [ngrams._additive_smoothing_(gram_length=1,param=0,counts=False,logs=False),
#               ngrams._additive_smoothing_(gram_length=2,param=0,counts=False,logs=False),
#               ngrams._additive_smoothing_(gram_length=3,param=0,counts=True,logs=False)]
#pp.pprint(gram_models)
#pp.pprint(ngrams._smoothed_gram_freqs_('who_are_are',gram_models,0.5,logs=False))
#
#test_level_jm_1 = ngrams.jelinek_mercer(1,cnvx_param=0.5,scnd_smoother='additive',param=0)
#pp.pprint(test_level_jm_1)
#test_level_jm_2 = ngrams.jelinek_mercer(2,cnvx_param=0.5,scnd_smoother='additive',param=0)
#pp.pprint(test_level_jm_2)
#test_level_jm_3 = ngrams.jelinek_mercer(3,cnvx_param=0.5,scnd_smoother='additive',param=0)
#pp.pprint(test_level_jm_3)

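# Quick smoke run (separate from the unit tests below): train a bigram
# classifier with Jelinek-Mercer smoothing (additive smoothing as the
# secondary smoother) and print the resulting per-class gram frequencies.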
ngrams.train_classifier(gram_length=2,
                        smoothing='jelinek-mercer',
                        scnd_smoother='additive',
                        cnvx_param=0.5,
                        smth_param=1)
pp.pprint(ngrams.gram_frequency)


class TestQuoraNgramAdditiveFreq(unittest.TestCase):
    def setUp(self):
        comments = [['xxxxx', ['where', 'are', 'you'], 1],
                    ['yyyyy', ['who', 'are', 'you'], 0]]
        self.vocab = NgramModel(comments)

    def tearDown(self):
        self.vocab = None

    def test_unigram_freq(self):
        self.vocab.train_classifier(gram_length=1,
                                    smoothing='additive',
                                    param=1)
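        # The expected values assume additive smoothing of the form
        # log((count + param) / (param * V + N)) per class, where V is the
        # number of distinct unigrams in that class's comment (3) and N is
        # its token count (3), plus an '<unk>' entry for unseen grams. This
        # reading matches the arithmetic below but is an inference about
        # NgramModel's internals. gram_frequency appears to be indexed by
        # class label, while the targets follow the comment order above
        # (class 1 first), hence the swapped indices in the assertions.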
        unigram_freqs_target = [{
            'where': np.log((1 + 1) / (1 * 3 + 3)),
            'are': np.log((1 + 1) / (1 * 3 + 3)),
            'you': np.log((1 + 1) / (1 * 3 + 3)),
            '<unk>': np.log((1 + 1) / (1 * 3 + 3))
        }, {
            'who': np.log((1 + 1) / (1 * 3 + 3)),
            'are': np.log((1 + 1) / (1 * 3 + 3)),
            'you': np.log((1 + 1) / (1 * 3 + 3)),
            '<unk>': np.log((1 + 1) / (1 * 3 + 3))
        }]
        self.assertDictEqual(self.vocab.gram_frequency[0],
                             unigram_freqs_target[1])
        self.assertDictEqual(self.vocab.gram_frequency[1],
                             unigram_freqs_target[0])

    def test_bigram_freq(self):
        self.vocab.train_classifier(gram_length=2,
                                    smoothing='additive',
                                    param=1)
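        # Same assumed formula over bigrams: each comment yields 2 bigram
        # types and 2 bigram tokens, giving (1 + 1) / (1 * 2 + 2).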
        bigram_freqs_target = [{
            'where_are': np.log((1 + 1) / (1 * 2 + 2)),
            'are_you': np.log((1 + 1) / (1 * 2 + 2)),
            '<unk>': np.log((1 + 1) / (1 * 2 + 2))
        }, {
            'who_are': np.log((1 + 1) / (1 * 2 + 2)),
            'are_you': np.log((1 + 1) / (1 * 2 + 2)),
            '<unk>': np.log((1 + 1) / (1 * 2 + 2))
        }]
        self.assertDictEqual(self.vocab.gram_frequency[0],
                             bigram_freqs_target[1])
        self.assertDictEqual(self.vocab.gram_frequency[1],
                             bigram_freqs_target[0])

    def test_trigram_freq(self):
        self.vocab.train_classifier(gram_length=3,
                                    smoothing='additive',
                                    param=1)
        trigram_freqs_target = [{
            'where_are_you': np.log((1 + 1) / (1 * 1 + 1)),
            '<unk>': np.log((1 + 1) / (1 * 1 + 1))
        }, {
            'who_are_you': np.log((1 + 1) / (1 * 1 + 1)),
            '<unk>': np.log((1 + 1) / (1 * 1 + 1))
        }]
        self.assertDictEqual(self.vocab.gram_frequency[0],
                             trigram_freqs_target[1])
        self.assertDictEqual(self.vocab.gram_frequency[1],
                             trigram_freqs_target[0])


class TestQuoraNgramJMFreq(unittest.TestCase):
    def setUp(self):
        comments = [
            ['xxxxx',
             ['where', 'where', 'where', 'are', 'are', 'you', 'you', 'now'],
             1],
            ['yyyyy',
             ['who', 'who', 'who', 'are', 'are', 'you', 'you', 'you'],
             0],
        ]
        self.vocab = NgramModel(comments)

    def tearDown(self):
        self.vocab = None

    def test_unigram_smoothed_freq(self):
        gram_models = [
            self.vocab._additive_smoothing_(gram_length=1,
                                            param=0,
                                            counts=False,
                                            logs=False)
        ]
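        # The expected values assume _smoothed_gram_freqs_ returns one entry
        # per comment of the form cnvx_param (0.5) * (relative frequency + a
        # 1/8 background term): 'who' covers 3 of the 8 tokens in one comment
        # and none of the other, 'are' covers 2 of 8 in both. Inferred from
        # the arithmetic, not from documented behaviour.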
        who_smoothed = [0.5 * (3 / 8 + 1 / 8), 0.5 * (0 + 1 / 8)]
        are_smoothed = [0.5 * (2 / 8 + 1 / 8), 0.5 * (2 / 8 + 1 / 8)]
        self.assertListEqual(
            self.vocab._smoothed_gram_freqs_('who',
                                             gram_models,
                                             0.5,
                                             logs=False), who_smoothed)
        self.assertListEqual(
            self.vocab._smoothed_gram_freqs_('are',
                                             gram_models,
                                             0.5,
                                             logs=False), are_smoothed)

    def test_bigram_smoothed_freq(self):
        gram_models = [
            self.vocab._additive_smoothing_(gram_length=1,
                                            param=0,
                                            counts=False,
                                            logs=False),
            self.vocab._additive_smoothing_(gram_length=2,
                                            param=0,
                                            counts=False,
                                            logs=False)
        ]
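        # Higher orders appear to chain the interpolation: a bigram's
        # smoothed value is cnvx_param * (its relative frequency over the 7
        # bigrams in the comment + the product of the smoothed unigram
        # values computed above). Again inferred from the expected numbers.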
        who_smoothed = [0.5 * (3 / 8 + 1 / 8), 0.5 * (0 + 1 / 8)]
        are_smoothed = [0.5 * (2 / 8 + 1 / 8), 0.5 * (2 / 8 + 1 / 8)]
        who_are_smoothed = [
            0.5 * (1 / 7 + who_smoothed[0] * are_smoothed[0]),
            0.5 * (0 + who_smoothed[1] * are_smoothed[1])
        ]
        are_are_smoothed = [
            0.5 * (1 / 7 + are_smoothed[0] * are_smoothed[0]),
            0.5 * (1 / 7 + are_smoothed[1] * are_smoothed[1])
        ]

        self.assertListEqual(
            self.vocab._smoothed_gram_freqs_('who_are',
                                             gram_models,
                                             0.5,
                                             logs=False), who_are_smoothed)
        self.assertListEqual(
            self.vocab._smoothed_gram_freqs_('are_are',
                                             gram_models,
                                             0.5,
                                             logs=False), are_are_smoothed)

    def test_trigram_smoothed_freq(self):
        gram_models = [
            self.vocab._additive_smoothing_(gram_length=1,
                                            param=0,
                                            counts=False,
                                            logs=False),
            self.vocab._additive_smoothing_(gram_length=2,
                                            param=0,
                                            counts=False,
                                            logs=False),
            self.vocab._additive_smoothing_(gram_length=3,
                                            param=0,
                                            counts=False,
                                            logs=False)
        ]
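        # Same chained interpolation one level deeper (6 trigrams per
        # comment, hence the 1 / 6 terms); here the expected values are kept
        # as nested single-element lists, matching the structure the trigram
        # call is asserted to return at the end of this test.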
        self.who_smoothed = [[0.5 * (3 / 8 + 1 / 8)], [0.5 * (0 + 1 / 8)]]
        self.are_smoothed = [[0.5 * (2 / 8 + 1 / 8)], [0.5 * (2 / 8 + 1 / 8)]]
        self.who_are_smoothed = [[
            0.5 * (1 / 7 + self.who_smoothed[0][0] * self.are_smoothed[0][0])
        ], [0.5 * (0 + self.who_smoothed[1][0] * self.are_smoothed[1][0])]]
        self.are_are_smoothed = [[
            0.5 * (1 / 7 + self.are_smoothed[0][0] * self.are_smoothed[0][0])
        ], [0.5 * (1 / 7 + self.are_smoothed[1][0] * self.are_smoothed[1][0])]]
        self.who_are_are_smoothed = [
            [
                0.5 *
                (1 / 6 +
                 self.who_are_smoothed[0][0] * self.are_are_smoothed[0][0])
            ],
            [
                0.5 *
                (0 + self.who_are_smoothed[1][0] * self.are_are_smoothed[1][0])
            ]
        ]

        self.assertListEqual(
            self.vocab._smoothed_gram_freqs_('who_are_are',
                                             gram_models,
                                             0.5,
                                             logs=False),
            self.who_are_are_smoothed)


class TestQuoraNgramGTFreq(unittest.TestCase):
    def setUp(self):
        comments = [
            ['xxxxx',
             ['where', 'where', 'where', 'are', 'are', 'you', 'you', 'now'],
             1],
            ['yyyyy',
             ['who', 'who', 'who', 'are', 'are', 'you', 'you', 'you'],
             0],
        ]
        self.vocab = NgramModel(comments)

    def tearDown(self):
        self.vocab = None

    def test_unigram_freq(self):
        self.vocab.good_turing(gram_length=1, param=0.5)
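        # The expected values appear to follow Good-Turing re-estimation,
        # c* = (c + 1) * N_{c+1} / N_c, normalised by the comment's token
        # count plus one (9), with param * N_c (param = 0.5) standing in for
        # N_{c+1} when that count-of-counts bucket is empty. Reconstructed
        # from the arithmetic below, not from NgramModel documentation.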
        unigram_freqs_target = [{
            'where': np.log((3 + 1) * (0.5 * 1) / (1) / 9),
            'are': np.log((2 + 1) * (1) / (2) / 9),
            'you': np.log((2 + 1) * (1) / (2) / 9),
            'now': np.log((1 + 1) * (2) / (2) / 9),
            '<unk>': np.log((1 + 1) * (2) / (2) / 9)
        }, {
            'who': np.log((3 + 1) * (0.5 * 2) / (2) / 9),
            'are': np.log((2 + 1) * (2) / (1) / 9),
            'you': np.log((3 + 1) * (0.5 * 2) / (2) / 9),
            '<unk>': np.log((1 + 1) * (1) / (1) / 9)
        }]
        self.vocab.train_classifier(gram_length=1, smoothing='good-turing')
        self.assertDictEqual(self.vocab.gram_frequency[0],
                             unigram_freqs_target[1])
        self.assertDictEqual(self.vocab.gram_frequency[1],
                             unigram_freqs_target[0])

    def test_bigram_freq(self):
        self.vocab.train_classifier(gram_length=2, smoothing='good-turing')
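        # Same assumed Good-Turing scheme over bigrams: each comment has 7
        # bigram tokens (hence the / 8 normaliser), and the count-of-counts
        # buckets differ between the two comments, which is why the two
        # target dicts use different N_c and N_{c+1} values.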
        bigram_freqs_target = [{
            'where_where': np.log((2 + 1) * (0.5 * 2) / (2) / 8),
            'where_are': np.log((1 + 1) * (1) / (6) / 8),
            'are_are': np.log((1 + 1) * (1) / (6) / 8),
            'are_you': np.log((1 + 1) * (1) / (6) / 8),
            'you_you': np.log((1 + 1) * (1) / (6) / 8),
            'you_now': np.log((1 + 1) * (1) / (6) / 8),
            '<unk>': np.log((1 + 1) * (1) / (6) / 8)
        }, {
            'who_who': np.log((2 + 1) * (0.5 * 2) / (2) / 8),
            'who_are': np.log((1 + 1) * (2) / (4) / 8),
            'are_are': np.log((1 + 1) * (2) / (4) / 8),
            'are_you': np.log((1 + 1) * (2) / (4) / 8),
            'you_you': np.log((2 + 1) * (0.5 * 2) / (2) / 8),
            '<unk>': np.log((1 + 1) * (2) / (4) / 8)
        }]
        self.assertDictEqual(self.vocab.gram_frequency[0],
                             bigram_freqs_target[1])
        self.assertDictEqual(self.vocab.gram_frequency[1],
                             bigram_freqs_target[0])