Beispiel #1
0
    def test_linebreak(self):
        # Two inputs that contain an embedded '\n' or '\r' are expected to
        # yield exactly two results, both for the default output with
        # byline=False and for '-Owakati' output with byline=True.
        texts = [u'今日は\n赤ちゃん', u'私が\rママよ']

        chunked = list(do_mecab_iter(texts, byline=False))
        # the collected results double as the failure message
        self.assertEqual(len(chunked), 2, chunked)

        wakati_lines = list(do_mecab_iter(texts, '-Owakati', byline=True))
        self.assertEqual(len(wakati_lines), 2, wakati_lines)
Beispiel #2
0
    def test_iter_Eopt_unicode(self):
        # Checks that a non-ASCII end-of-sentence format shows up at the end
        # of every yielded chunk, for each of the four ways the option is
        # spelled below.
        sentences = [u'となりの客はよく柿食う客だ', u'バスガス爆発']

        def check_eos(*eos_args):
            # Every chunk must end with the custom marker, and the two
            # inputs must produce exactly two chunks.
            seen = 0
            chunks = do_mecab_iter(sentences, *eos_args, byline=False)
            for seen, chunk in enumerate(chunks, 1):
                self.assertEqual(chunk[-5:], u'\nおしまい')
            self.assertEqual(seen, 2)

        # short option with the value attached
        check_eos(u'-Eおしまい\n')

        # short option with the value as a separate argument
        check_eos('-E', u'おしまい\n')

        # long option in '=' form
        check_eos(u'--eos-format=おしまい\n')

        # long option with the value as a separate argument
        check_eos('--eos-format', u'おしまい\n')
Beispiel #3
0
    def test_iter_Eopt(self):
        # Checks that a custom ASCII end-of-sentence format ('END') shows up
        # at the end of every yielded chunk, for each of the four ways the
        # option is spelled below.
        sentences = [u'となりの客はよく柿食う客だ', u'バスガス爆発']

        def check_eos(*eos_args):
            # Every chunk must end with the custom marker, and the two
            # inputs must produce exactly two chunks.
            seen = 0
            chunks = do_mecab_iter(sentences, *eos_args, byline=False)
            for seen, chunk in enumerate(chunks, 1):
                self.assertEqual(chunk[-4:], '\nEND')
            self.assertEqual(seen, 2)

        # short option with the value attached
        check_eos('-EEND\n')

        # short option with the value as a separate argument
        check_eos('-E', 'END\n')

        # long option in '=' form
        check_eos('--eos-format=END\n')

        # long option with the value as a separate argument
        check_eos('--eos-format', 'END\n')
Beispiel #4
0
    def test_iter(self):
        # Checks that do_mecab_iter returns a generator, and pins what that
        # generator yields for the '-F%m\n' node format in both byline modes.

        # byline=True: every output line is yielded as its own item
        ins = [u'アイスコーヒー', u'飲みたい']
        it = do_mecab_iter(ins, '-F%m\n', byline=True)
        # assertIsInstance reports the offending object and expected type on
        # failure, unlike assertTrue(isinstance(...)).
        self.assertIsInstance(it, types.GeneratorType)
        self.assertEqual(list(it),
                         [u'アイス', u'コーヒー', u'EOS', u'飲み', u'たい', u'EOS'])

        # byline=False: one multi-line item is yielded per input
        ins = [u'ぶどうパン', u'食べたい']
        it = do_mecab_iter(ins, '-F%m\n', byline=False)
        self.assertIsInstance(it, types.GeneratorType)
        self.assertEqual(list(it), [u'ぶどう\nパン\nEOS', u'食べ\nたい\nEOS'])
Beispiel #5
0
    def test_iter_count(self):
        # Pins how many items do_mecab_iter yields for two input sentences
        # under three combinations of output format and byline.
        sentences = [u'となりの客はよく柿食う客だ', u'バスガス爆発']

        # default output format, byline=False -> two items
        n_default = sum(1 for _ in do_mecab_iter(sentences, byline=False))
        self.assertEqual(n_default, 2)

        # wakati output, byline=True -> two items
        n_wakati_lines = sum(
            1 for _ in do_mecab_iter(sentences, '-Owakati', byline=True))
        self.assertEqual(n_wakati_lines, 2)

        # wakati output, byline=False -> a single item
        n_wakati_chunks = sum(
            1 for _ in do_mecab_iter(sentences, '-Owakati', byline=False))
        self.assertEqual(n_wakati_chunks, 1)
Beispiel #6
0
    def test_large_input_iter(self):
        # Exercises do_mecab_iter on a single input whose encoded length is
        # over 30000 bytes, across combinations of auto_buffer_size, truncate
        # and an explicit '-b' option.
        enc = detect_mecab_enc()
        x = u'隣の客はよく柿食う客かな?'
        bx = len(x.encode(enc))
        # repeat this until it is over 30000 bytes
        tgt = 30000
        rep = tgt // bx + 1
        y = x * rep
        by = len(y.encode(enc))
        # make sure that the computation above works on every platform
        self.assertGreater(by, tgt)

        out1 = list(do_mecab_iter([y],
                                  '-Owakati',
                                  byline=True,
                                  auto_buffer_size=True,
                                  truncate=True))
        out2 = list(do_mecab_iter([y],
                                  '-Owakati',
                                  byline=True,
                                  auto_buffer_size=True,
                                  truncate=False))
        out3 = list(do_mecab_iter([y],
                                  '-Owakati',
                                  byline=True,
                                  auto_buffer_size=False,
                                  truncate=True))
        # NOTE(review): no warnings.simplefilter('always') is set inside the
        # catch_warnings blocks of this test, so what gets recorded depends on
        # the warning filters active when the test runs -- confirm intended.
        with warnings.catch_warnings(record=True) as w:
            out4 = list(do_mecab_iter([y],
                                      '-Owakati',
                                      byline=True,
                                      auto_buffer_size=False,
                                      truncate=False))
            self.assertEqual(len(w), 1, 'auto=False, trunc=False')

        # test of result length
        self.assertEqual(len(out1), 1)
        self.assertEqual(len(out2), 1)
        self.assertEqual(len(out3), 1)
        self.assertGreater(len(out4), 1)

        # test of truncation
        def num_question(out):
            # x ends with exactly one '?', so this counts how many copies of
            # x are still present in the output.
            return u''.join(out).count(u'?')

        self.assertEqual(num_question(out1), rep)
        self.assertEqual(num_question(out2), rep)
        self.assertLess(num_question(out3), rep)
        # The '?' count of out4 is deliberately not asserted: what happens
        # to it in this case is ambiguous.

        # test of manual -b option
        # if we set enough buffer size level, we should be fine
        out5 = list(do_mecab_iter([y],
                                  '-Owakati',
                                  '-b',
                                  str(by + 1),
                                  byline=True,
                                  auto_buffer_size=False,
                                  truncate=True))
        out6 = list(do_mecab_iter([y],
                                  '-Owakati',
                                  '-b',
                                  str(by + 1),
                                  byline=True,
                                  auto_buffer_size=False,
                                  truncate=False))
        self.assertEqual(len(out5), 1)
        self.assertEqual(len(out6), 1)
        self.assertEqual(num_question(out5), rep)
        self.assertEqual(num_question(out6), rep)

        # if the buffer size is small, we should get warning
        out7 = list(do_mecab_iter([y],
                                  '-Owakati',
                                  '-b',
                                  str(by),
                                  byline=True,
                                  auto_buffer_size=False,
                                  truncate=True))
        with warnings.catch_warnings(record=True) as w:
            out8 = list(do_mecab_iter([y],
                                      '-Owakati',
                                      '-b',
                                      str(by),
                                      byline=True,
                                      auto_buffer_size=False,
                                      truncate=False))
            self.assertEqual(len(w), 1, 'auto=False, trunc=False, -b small')
        self.assertEqual(len(out7), 1)
        self.assertGreater(len(out8), 1)
        self.assertLess(num_question(out7), rep)
        # As with out4, the '?' count of out8 is deliberately not asserted.

        # if we set -b option and auto_buffer_size together,
        # we should get warning and we will use the auto size
        with warnings.catch_warnings(record=True) as w:
            out9 = list(do_mecab_iter([y],
                                      '-Owakati',
                                      '-b',
                                      str(by + 1),
                                      byline=True,
                                      auto_buffer_size=True))
            # The message names this case; the original text was copied from
            # the small-buffer case above and described the wrong setup.
            self.assertEqual(len(w), 1, 'auto=True with explicit -b')
        self.assertEqual(len(out9), 1)
        self.assertEqual(num_question(out9), rep)

        # result equality
        self.assertEqual(out1, out2)
        self.assertEqual(out1, out5)
        self.assertEqual(out1, out6)
        self.assertEqual(out1, out9)
        # and inequality
        self.assertNotEqual(out1, out3)
        self.assertNotEqual(out1, out4)
        self.assertNotEqual(out1, out7)
        self.assertNotEqual(out1, out8)