Example #1
 def test_make_furi_token(self):
     s = deko.parse('友達')
     # f = ttlig.mctoken_to_furi(s[0])
     f = ttlig.RubyToken.from_furi(s[0].text, s[0].reading_hira)
     self.assertEqual(f.to_code(), '{友達/ともだち}')
     # half-width char
     s = deko.parse('0')
     f = ttlig.RubyToken.from_furi(s[0].text, s[0].reading_hira)
     self.assertEqual(f.to_code(), '0')
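
Applied token by token, the same from_furi/to_code pair turns a whole parsed sentence into an inline furigana string. A minimal sketch using only the calls exercised above; the helper name is ours, and the exact output for kana-only tokens is an assumption extrapolated from the half-width case:

def to_furi_code(text):
    # Parse with MeCab via deko, then emit one {surface/reading} code per token.
    sent = deko.parse(text)
    return ''.join(
        ttlig.RubyToken.from_furi(tk.text, tk.reading_hira).to_code()
        for tk in sent)

# to_furi_code('雨が降る。') should yield something like '{雨/あめ}が{降る/ふる}。'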
Example #2
 def test_dekomecab(self):
     # try parsing text using mecab binary
     self.assertRaises(
         FileNotFoundError,
         lambda: parse(txt, mecab_loc='/usr/bin/path/to/mecab-binary-app'))
     self.assertRaises(
         FileNotFoundError, lambda: analyse(
             txt, mecab_loc='/usr/bin/path/to/mecab-binary-app'))
     self.assertRaises(
         FileNotFoundError,
         lambda: deko.parse(txt,
                            mecab_loc='/usr/bin/path/to/mecab-binary-app'))
     self.assertRaises(
         FileNotFoundError, lambda: deko.parse_doc(
             txt, mecab_loc='/usr/bin/path/to/mecab-binary-app'))
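
The mecab_loc keyword pins the MeCab binary explicitly; this test only checks that a non-existent path makes parse, analyse, deko.parse and deko.parse_doc all raise FileNotFoundError. The happy path is the same call with a real location; the path below is a placeholder, not a guaranteed install location:

# Use a specific MeCab binary instead of whatever is on $PATH.
sent = deko.parse('雨が降る。', mecab_loc='/usr/local/bin/mecab')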
Example #3
 def test_analyse_multiple_sents(self):
     sent = parse(txt4)
     expected_tokens = [
         '猫', 'が', '好き', 'です', '。', '犬', 'も', '好き', 'です', '。', '鳥', 'は'
     ]
     tokens = list(sent.tokens.values())
     self.assertEqual(tokens, expected_tokens)
     # check reading
     readings = [tk.reading_hira for tk in sent]
     expected_readings = [
         'ねこ', 'が', 'すき', 'です', '。', 'いぬ', 'も', 'すき', 'です', '。', 'とり', 'は'
     ]
     self.assertEqual(readings, expected_readings)
     # try tokenizing sentences
     doc = deko.parse_doc(txt4, splitlines=False)
     expected = [['猫が好きです。', ['猫', 'が', '好き', 'です', '。']],
                 ['犬も好きです。', ['犬', 'も', '好き', 'です', '。']],
                 ['鳥は', ['鳥', 'は']]]
     actual = [[sent.text, list(sent.tokens.values())] for sent in doc]
     self.assertEqual(expected, actual)
     # try tokenize text to sentences
     sents = deko.tokenize_sent(txt4)
     expected = ['猫が好きです。', '犬も好きです。', '鳥は']
     self.assertEqual(expected, sents)
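
With splitlines=False, parse_doc runs sentence segmentation over the raw string, so one physical line can yield several sentences; with splitlines=True (see Example #7) each input line is taken as one sentence. A sketch of the segmenting mode, reusing the sentences from the test above:

text = '猫が好きです。犬も好きです。鳥は'
doc = deko.parse_doc(text, splitlines=False)  # segmented into 3 sentences
for sent in doc:
    print(sent.text, list(sent.tokens.values()))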
Example #4
def text_to_igrow(txt):
    ''' Parse text to TTLIG format '''
    sent = deko.parse(txt)
    return ttl_to_igrow(sent)
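
ttl_to_igrow is defined elsewhere in the same module; this wrapper only adds the MeCab step. A hypothetical call, assuming that helper is available:

# ig = text_to_igrow('雨が降る。')  # parse with MeCab, then convert the TTL sentence to TTLIG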
Example #5
 def test_func_alias(self):
     sent = deko.parse(txt)
     self.assertEqual(sent.words, ['雨', 'が', '降る', '。'])
     doc = deko.parse_doc(txt3, splitlines=False)
     self.assertEqual(len(doc), 3)
Example #6
 def test_func_alias(self):
     sent = parse(txt)
     self.assertEqual(sent.tokens.values(), ['雨', 'が', '降る', '。'])
     doc = parse_doc(txt3, splitlines=False)
     self.assertEqual(len(doc), 3)
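
Examples #5 and #6 exercise the same behaviour through two spellings: the qualified deko.parse/deko.parse_doc and the directly imported parse/parse_doc aliases, with sent.words as a shorthand for the token surface list. A quick equivalence check, assuming both names are imported as in these tests:

sent_a = deko.parse(txt)
sent_b = parse(txt)
assert sent_a.words == list(sent_b.tokens.values()) == ['雨', 'が', '降る', '。']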
Example #7
    def test_deko_ttl(self):
        sent = parse(txt)
        sj = sent.to_dict()
        expected = {
            'text': '雨が降る。',
            'tokens': [
                {'cfrom': 0, 'cto': 1, 'lemma': '雨', 'pos': '名詞',
                 'tags': [{'type': 'sc1', 'value': '一般'},
                          {'type': 'pos3', 'value': '名詞-一般'},
                          {'type': 'reading_hira', 'value': 'あめ'}],
                 'text': '雨'},
                {'cfrom': 1, 'cto': 2, 'lemma': 'が', 'pos': '助詞',
                 'tags': [{'type': 'sc1', 'value': '格助詞'},
                          {'type': 'sc2', 'value': '一般'},
                          {'type': 'pos3', 'value': '助詞-格助詞-一般'},
                          {'type': 'reading_hira', 'value': 'が'}],
                 'text': 'が'},
                {'cfrom': 2, 'cto': 4, 'lemma': '降る', 'pos': '動詞',
                 'tags': [{'type': 'sc1', 'value': '自立'},
                          {'type': 'inf', 'value': '五段・ラ行'},
                          {'type': 'conj', 'value': '基本形'},
                          {'type': 'pos3', 'value': '動詞-自立'},
                          {'type': 'reading_hira', 'value': 'ふる'}],
                 'text': '降る'},
                {'cfrom': 4, 'cto': 5, 'lemma': '。', 'pos': '記号',
                 'tags': [{'type': 'sc1', 'value': '句点'},
                          {'type': 'pos3', 'value': '記号-句点'},
                          {'type': 'reading_hira', 'value': '。'}],
                 'text': '。'},
            ],
        }
        self.assertEqual(sj, expected)
        # test doc to ttl
        ttl_doc = deko.parse_doc(txt3, splitlines=True)
        self.assertEqual(len(ttl_doc), 3)

        for sent, sent_text in zip(ttl_doc, txt3.splitlines()):
            tokens = deko.tokenize(sent_text, use_wakati=True)
            self.assertEqual(sent.text, sent_text)
            self.assertEqual(tokens, list(sent.tokens.values()))
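
In the to_dict() output above, per-token annotations sit in a flat 'tags' list, so pulling one back out (say, the hiragana reading) is a linear scan. A small helper sketched over that structure; the function name is ours, not part of the library:

def tag_value(token_dict, tag_type):
    # Return the value of the first tag with the given type, or None.
    for tag in token_dict.get('tags', []):
        if tag['type'] == tag_type:
            return tag['value']
    return None

sj = parse(txt).to_dict()
print([tag_value(tk, 'reading_hira') for tk in sj['tokens']])
# -> ['あめ', 'が', 'ふる', '。']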
Example #8
 def test_pos(self):
     sent = parse(txt)
     poses = [tk.pos3 for tk in sent]
     self.assertEqual(poses, ['名詞-一般', '助詞-格助詞-一般', '動詞-自立', '記号-句点'])
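
Because pos3 is a plain string on each token, standard tools work for aggregation; for instance, a part-of-speech histogram with collections.Counter (illustrative, not from the test suite):

from collections import Counter

sent = parse(txt)
print(Counter(tk.pos3 for tk in sent))
# Counter({'名詞-一般': 1, '助詞-格助詞-一般': 1, '動詞-自立': 1, '記号-句点': 1})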
Example #9
 def test_mecab_lines(self):
     out = parse(txt2)
     self.assertGreaterEqual(len(out), 10)  # EOS is removed automatically
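
The comment is the point of this test: MeCab terminates its output with an EOS line, and parse strips it, so len counts only real tokens. For the short sentence used elsewhere in this suite:

out = parse('雨が降る。')
assert len(out) == 4  # 雨 / が / 降る / 。 only; no EOS entry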
Example #10
 def test_mecab(self):
     sent = parse(txt)
     self.assertEqual(['雨', 'が', '降る', '。'], list(sent.tokens.values()))