def test_make_furi_token(self):
    s = deko.parse('友達')
    # f = ttlig.mctoken_to_furi(s[0])
    f = ttlig.RubyToken.from_furi(s[0].text, s[0].reading_hira)
    self.assertEqual(f.to_code(), '{友達/ともだち}')
    # half-width char
    s = deko.parse('0')
    f = ttlig.RubyToken.from_furi(s[0].text, s[0].reading_hira)
    self.assertEqual(f.to_code(), '0')

def test_dekomecab(self):
    # parsing with a non-existent MeCab binary location must raise FileNotFoundError
    self.assertRaises(
        FileNotFoundError,
        lambda: parse(txt, mecab_loc='/usr/bin/path/to/mecab-binary-app'))
    self.assertRaises(
        FileNotFoundError,
        lambda: analyse(txt, mecab_loc='/usr/bin/path/to/mecab-binary-app'))
    self.assertRaises(
        FileNotFoundError,
        lambda: deko.parse(txt, mecab_loc='/usr/bin/path/to/mecab-binary-app'))
    self.assertRaises(
        FileNotFoundError,
        lambda: deko.parse_doc(txt, mecab_loc='/usr/bin/path/to/mecab-binary-app'))

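# A sketch of the mecab_loc override exercised above, assuming a working MeCab
# install (the '/usr/local/bin/mecab' path is only an example, not part of the
# test suite):
#
#     sent = deko.parse(txt, mecab_loc='/usr/local/bin/mecab')
#
# With a path that does not exist, the wrappers raise FileNotFoundError, which
# is exactly what test_dekomecab asserts.
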
def test_analyse_multiple_sents(self):
    sent = parse(txt4)
    expected_tokens = [
        '猫', 'が', '好き', 'です', '。', '犬', 'も', '好き', 'です', '。', '鳥', 'は'
    ]
    tokens = list(sent.tokens.values())
    self.assertEqual(tokens, expected_tokens)
    # check readings
    readings = [tk.reading_hira for tk in sent]
    expected_readings = [
        'ねこ', 'が', 'すき', 'です', '。', 'いぬ', 'も', 'すき', 'です', '。', 'とり', 'は'
    ]
    self.assertEqual(readings, expected_readings)
    # try splitting the text into sentences and tokenizing each one
    doc = deko.parse_doc(txt4, splitlines=False)
    expected = [['猫が好きです。', ['猫', 'が', '好き', 'です', '。']],
                ['犬も好きです。', ['犬', 'も', '好き', 'です', '。']],
                ['鳥は', ['鳥', 'は']]]
    actual = [[sent.text, list(sent.tokens.values())] for sent in doc]
    self.assertEqual(expected, actual)
    # try tokenizing the text into sentences only
    sents = deko.tokenize_sent(txt4)
    expected = ['猫が好きです。', '犬も好きです。', '鳥は']
    self.assertEqual(expected, sents)

def text_to_igrow(txt):
    ''' Parse text to TTLIG format '''
    sent = deko.parse(txt)
    return ttl_to_igrow(sent)

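# Usage sketch for text_to_igrow (illustrative only; the concrete TTLIG object
# returned by ttl_to_igrow depends on the installed MeCab dictionary):
#
#     row = text_to_igrow('友達')
#     # furigana groups in TTLIG follow the '{surface/reading}' convention,
#     # e.g. '{友達/ともだち}' as asserted in test_make_furi_token above.
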
def test_func_alias(self):
    sent = deko.parse(txt)
    self.assertEqual(sent.words, ['雨', 'が', '降る', '。'])
    doc = deko.parse_doc(txt3, splitlines=False)
    self.assertEqual(len(doc), 3)

def test_func_alias(self):
    sent = parse(txt)
    self.assertEqual(sent.tokens.values(), ['雨', 'が', '降る', '。'])
    doc = parse_doc(txt3, splitlines=False)
    self.assertEqual(len(doc), 3)

def test_deko_ttl(self):
    sent = parse(txt)
    sj = sent.to_dict()
    expected = {
        'text': '雨が降る。',
        'tokens': [
            {'cfrom': 0, 'cto': 1, 'lemma': '雨', 'pos': '名詞',
             'tags': [{'type': 'sc1', 'value': '一般'},
                      {'type': 'pos3', 'value': '名詞-一般'},
                      {'type': 'reading_hira', 'value': 'あめ'}],
             'text': '雨'},
            {'cfrom': 1, 'cto': 2, 'lemma': 'が', 'pos': '助詞',
             'tags': [{'type': 'sc1', 'value': '格助詞'},
                      {'type': 'sc2', 'value': '一般'},
                      {'type': 'pos3', 'value': '助詞-格助詞-一般'},
                      {'type': 'reading_hira', 'value': 'が'}],
             'text': 'が'},
            {'cfrom': 2, 'cto': 4, 'lemma': '降る', 'pos': '動詞',
             'tags': [{'type': 'sc1', 'value': '自立'},
                      {'type': 'inf', 'value': '五段・ラ行'},
                      {'type': 'conj', 'value': '基本形'},
                      {'type': 'pos3', 'value': '動詞-自立'},
                      {'type': 'reading_hira', 'value': 'ふる'}],
             'text': '降る'},
            {'cfrom': 4, 'cto': 5, 'lemma': '。', 'pos': '記号',
             'tags': [{'type': 'sc1', 'value': '句点'},
                      {'type': 'pos3', 'value': '記号-句点'},
                      {'type': 'reading_hira', 'value': '。'}],
             'text': '。'}
        ]
    }
    self.assertEqual(sj, expected)
    # test doc to ttl
    ttl_doc = deko.parse_doc(txt3, splitlines=True)
    self.assertEqual(len(ttl_doc), 3)
    for sent, sent_text in zip(ttl_doc, txt3.splitlines()):
        tokens = deko.tokenize(sent_text, use_wakati=True)
        self.assertEqual(sent.text, sent_text)
        self.assertEqual(tokens, list(sent.tokens.values()))

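# Note: deko.tokenize(sent_text, use_wakati=True) used above returns only the
# token surface forms (wakati-gaki output), so it can be compared directly with
# the sentence's token texts; e.g. '雨が降る。' tokenizes to
# ['雨', 'が', '降る', '。'] (see test_mecab below).
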
def test_pos(self):
    sent = parse(txt)
    poses = [tk.pos3 for tk in sent]
    self.assertEqual(poses, ['名詞-一般', '助詞-格助詞-一般', '動詞-自立', '記号-句点'])

def test_mecab_lines(self):
    out = parse(txt2)
    self.assertGreaterEqual(len(out), 10)  # EOS is removed automatically

def test_mecab(self):
    sent = parse(txt)
    self.assertEqual(['雨', 'が', '降る', '。'], list(sent.tokens.values()))