def change_case(knp_lines, prev_case_katakana, new_case_katakana, verb_chunk_ind):
    """Rewrite the case particle of the argument that modifies a verb chunk.

    Finds the chunk carrying case ``prev_case_katakana`` among the chunks that
    modify the chunk at ``verb_chunk_ind``, then rewrites every matching
    case-particle token line lying between that argument chunk and the verb
    chunk so that it carries ``new_case_katakana`` instead.

    Args (Python 2 utf-8 byte strings unless noted):
        knp_lines: KNP analysis output, one line per list element.
        prev_case_katakana: current case marker in katakana (e.g. 'ヲ').
        new_case_katakana: replacement case marker in katakana.
        verb_chunk_ind (int): index of the verb's chunk-header line.

    Returns:
        The rewritten lines filtered down to token lines only, or [] when no
        modifying chunk carries the requested case.
    """
    chunk_line = knp_lines[verb_chunk_ind]
    chunk_num = get_chunk_num(chunk_line)

    # If the same case appears twice, take the later one (closer to the
    # predicate).  Unlikely, but e.g. the odd sentence 山を海を愛す → 「海を」.
    prev_case_chunks = [(mod_chunk_ind, mod_chunk_line) for (mod_chunk_ind, mod_chunk_line) in get_mod_chunk_and_mod_chunk_ind_lst(knp_lines, chunk_num) if (("<係:%s格>" % prev_case_katakana) in mod_chunk_line)]

    # No modifying chunk with this case: return an empty list.
    if len(prev_case_chunks) == 0:
        return []

    prev_case_chunk_ind, prev_case_chunk_line = prev_case_chunks[-1]

    # Map one line: token lines strictly between the argument chunk and the
    # verb chunk whose surface is the prev_case particle are rewritten to the
    # new_case particle; every other line is returned unchanged.
    def loc_map_func(ind, line, prev_case_chunk_ind, verb_chunk_ind, prev_case_hiragana, new_case_hiragana):
        if prev_case_chunk_ind < ind < verb_chunk_ind:
            if is_token(line) and line.startswith('%s %s %s 助詞 9 格助詞' % (prev_case_hiragana, prev_case_hiragana, prev_case_hiragana)):
                return "%s %s %s 助詞 9 格助詞 1 * 0 * 0 NIL" % (new_case_hiragana, new_case_hiragana, new_case_hiragana)
            else:
                return line
        else:
            return line


    # Particles appear in hiragana in token lines, so convert the katakana
    # case names (Python 2: bytes → unicode → bytes round-trip).
    prev_case_hiragana =  jctconv.kata2hira(prev_case_katakana.decode('utf-8')).encode('utf-8')
    new_case_hiragana =  jctconv.kata2hira(new_case_katakana.decode('utf-8')).encode('utf-8')

    # Python 2 tuple-unpacking lambda over (index, line) pairs.
    tmp_knp_lines = map(lambda (ind, line): loc_map_func(ind, line, prev_case_chunk_ind, verb_chunk_ind, prev_case_hiragana, new_case_hiragana), enumerate(knp_lines))

    # NOTE(review): only token lines are returned; chunk-header lines are
    # dropped here — confirm callers expect that.
    return [line for line in tmp_knp_lines if is_token(line)]
Example #2
0
def toHiragana(word):
    """Return the hiragana reading of *word* (utf-8 bytes), or None.

    Kana input is converted directly with jctconv; anything else is run
    through MeCab and the reading of the first morpheme (feature field 7)
    is converted.  Returns None when MeCab yields no usable reading.
    """
    if isKatakana(word) or isHiragana(word):
        return jctconv.kata2hira(unicode(word, 'utf-8')).encode('utf-8')

    tagger = MeCab.Tagger()
    fields = tagger.parse(word).split()
    if fields[0] == 'EOS':
        return None
    features = fields[1].split(',')
    if len(features) < 8:
        return None
    return jctconv.kata2hira(unicode(features[7], 'utf-8')).encode('utf-8')
Example #3
0
 def kana2hira(strings):
     """Convert full-width katakana in *strings* to full-width hiragana.

     All other characters pass through unchanged.  Input is normalized to
     unicode via MultiBytes.convert2unicode before conversion.
     """
     return jctconv.kata2hira(MultiBytes.convert2unicode(strings))
Example #4
0
def add_phonetic(token):
    """Annotate token.surface with its hiragana reading as '|surface(reading)'."""
    if re.search('[0-9a-zA-Zあ-んア-ン]', token.surface):  # surface contains non-kanji
        # Mixed surface: split it into kana/kanji parts via disassembly().
        head, kanji, yomi, tail = disassembly(token)
        return head + '|' + kanji + '(' + yomi + ')' + tail
    # Pure kanji surface: annotate the whole token.
    return '|' + token.surface + '(' + jctconv.kata2hira(
        token.reading) + ')'
 def translate(self, src):
     """Return *src* (utf-8 byte string) converted to a hiragana reading.

     Walks the MeCab node list building a katakana reading, then converts
     it to hiragana.  Per node, in priority order:
       1. an explicit entry in the module-level ``converter`` table,
       2. a conjugation-ending override from ``self._get_gobi``,
       3. the reading field (``feature[8]``) when it is not ``'*'``,
       4. the raw surface form as a fallback.
     ``_check_san`` appends an honorific 'さん'; otherwise
     ``_check_separator`` may append a '、'.
     """
     node = self.mecab.parseToNode(src)
     text = ''
     # (Removed dead `pre_node`: it was assigned AFTER advancing the node,
     # so it always equalled `node` and was never read.)
     while node:
         features = node.feature.split(',')
         if node.surface in converter:
             text += converter[node.surface]
         elif len(features) > 8:
             gobi = self._get_gobi(node)
             if gobi is not None:
                 text += gobi
             elif features[8] != '*':
                 text += features[8]
             else:
                 text += node.surface
         else:
             text += node.surface
         if self._check_san(node):
             text += 'さん'
         elif self._check_separator(node):
             text += '、'
         node = node.next
     return jctconv.kata2hira(text.decode('utf-8')).encode('utf-8')
Example #6
0
def get_kanji_part_yomi(surface, reading):
    '''Get yomi of kanji part of the string. Assume only one kanji part.

    Strips the katakana form of each non-kanji run out of *reading*; what
    remains is the reading of the kanji part.  Handles at most two separated
    non-kanji runs.  Returns None when the surface has no non-kanji part
    (per non_kanji_pos).
    '''
    nkidx = non_kanji_pos(surface)
    if nkidx is None:
        return None

    non_kanji = surface[nkidx[0]:nkidx[1]]
    # re.escape: the extracted substring is literal text, not a regex
    # pattern — without escaping, metacharacters would corrupt the match.
    yomi = re.sub(re.escape(jctconv.hira2kata(non_kanji)), '', reading)

    # case of two separated hiragana part
    surface2 = surface[nkidx[1]:]
    nkidx2 = non_kanji_pos(surface2)
    if nkidx2 is None:
        return jctconv.kata2hira(yomi)

    non_kanji2 = surface2[nkidx2[0]:nkidx2[1]]
    yomi = re.sub(re.escape(jctconv.hira2kata(non_kanji2)), '', yomi)

    return jctconv.kata2hira(yomi)
Example #7
0
def Kanji2Hiragana(message):
    """Return the hiragana reading of *message* via the external `mecab -Oyomi` command."""
    # Strip :emoji: codes and newlines before feeding the text to mecab.
    cleaned = re.sub(':.*:', '', message)
    cleaned = re.sub('\n', '', cleaned)
    # echo | mecab pipeline; close our copy of echo's stdout so mecab sees EOF.
    echo_proc = subprocess.Popen(['echo', cleaned], stdout=subprocess.PIPE)
    mecab_proc = subprocess.Popen(['mecab', '-Oyomi'],
                                  stdin=echo_proc.stdout,
                                  stdout=subprocess.PIPE)
    echo_proc.stdout.close()
    yomi = mecab_proc.communicate()[0]
    return jctconv.kata2hira(re.sub(r'\n', '', yomi.decode('utf-8')))
def format_arg_case(case):
    """Map a KNP case label (katakana + '格') to its hiragana particle.

    Labels like 'hoge:同格未格' can occur, e.g. in
    「人間以外の被造物に対してお辞儀などしてはならない。」 — any label
    containing '未格' is rendered as the topic particle 'は'.
    """
    if "未格" in case:
        return 'は'
    # Drop the '格' suffix, then convert katakana → hiragana (Python 2 bytes).
    stripped = case.replace('格', '').decode('utf-8')
    return jctconv.kata2hira(stripped).encode('utf-8')
Example #9
0
def preprocesseJct(words):
    """Augment *words* with MeCab readings and their hiragana forms.

    For each word, the katakana reading (MeCab feature field 7) is appended,
    plus the same reading converted to hiragana.  Words MeCab cannot analyse
    (bare 'EOS' or too few feature fields) contribute nothing extra.

    Args:
        words: list of utf-8 byte strings (Python 2).
    Returns:
        De-duplicated list of the originals plus the added readings;
        order is unspecified because of the set round-trip.
    """
    # Strings are immutable, so a shallow copy suffices (deepcopy was overkill).
    new_words = list(words)
    mecab = MeCab.Tagger()
    for w in words:
        fields = mecab.parse(w).split()
        if fields[0] == 'EOS':
            continue
        features = fields[1].split(',')
        if len(features) < 8:
            continue
        new_words.append(features[7])
        hira = jctconv.kata2hira(unicode(features[7], 'utf-8'))
        new_words.append(hira.encode('utf-8'))

    return list(set(new_words))
Example #10
0
def disassembly(token):
    """Split *token* into (leading kana, kanji run, kanji reading, okurigana).

    Assumes token.surface contains a single contiguous kanji run, optionally
    preceded and followed by kana.  The kanji reading is derived by removing
    the kana parts from the full hiragana reading of the token.

    Returns:
        (before_kana, kanji, reading_of_kanji, okurigana) — all str.
    """
    in_or_after_kanji = False
    before_kana = ''
    kanji = ''
    okurigana = ''
    read = jctconv.kata2hira(token.reading)
    for ch in token.surface:
        if re.search('[\u4E00-\u9FD0]', ch):
            kanji += ch
            in_or_after_kanji = True
        elif in_or_after_kanji:
            # kana after the kanji run: okurigana
            okurigana += ch
        else:
            # kana before the kanji run (e.g. a prefix)
            before_kana += ch
    # Remove the kana parts from the reading.  Use str.replace, not re.sub:
    # the kana substrings are literal text, and feeding them to re.sub as
    # patterns would break on any regex metacharacter.
    read = read.replace(okurigana, '') if okurigana else read
    read = read.replace(before_kana, '') if before_kana else read
    return before_kana, kanji, read, okurigana
Example #11
0
def chunk_with_hira(istr, keep_katakana=False):
    """Tokenize *istr* with Janome and return space-separated hiragana chunks.

    Each token's reading is converted to hiragana.  A space normally follows
    a token, but is suppressed when the token and its successor form a
    grammatically bound pair (verb + auxiliary, prefix + noun, suffix, ...),
    so bound tokens end up inside the same chunk.  (Python 2 variant.)

    Args:
        istr: input text for Tokenizer().tokenize.
        keep_katakana: when True, tokens lacking a reading ('*') keep their
            katakana surface instead of being converted to hiragana.

    Returns:
        A unicode string of hiragana chunks separated by spaces.
    """
    t = Tokenizer()
    tokens = t.tokenize(istr)

    # Per-token readings and surfaces (Python 2: decode reading bytes).
    readings = [x.reading.decode('utf-8') for x in tokens]
    surfaces = [x.surface for x in tokens]

    # Coarse part-of-speech (first CSV field), normalized to unicode.
    pos = []
    for token in tokens:
        p = token.part_of_speech.split(',')[0]
        if isinstance(p, unicode):
            pos.append(p)
        else:
            pos.append(p.decode('utf-8'))

    # Fine part-of-speech (second CSV field), normalized to unicode.
    pos2 = []
    for token in tokens:
        p = token.part_of_speech.split(',')[1]
        if isinstance(p, unicode):
            pos2.append(p)
        else:
            pos2.append(p.decode('utf-8'))

    rstr = u''
    for i, z in enumerate(zip(readings, surfaces, pos, pos2)):
        r, s, p, p2 = z

        # No reading available: fall back to the surface form.
        if r == u'*':
            if not keep_katakana:
                if re.match(TOKENS_KATAKANA, s):
                    rstr += jctconv.kata2hira(s) + u' '
                else:
                    rstr += s + u' '
            else:
                rstr += s + u' '
            continue

        # Two-token glue rules: omit the trailing space when this POS pair binds.
        if i < len(pos) - 1:
            if pos[i] == u'助動詞' and pos[i+1] == u'助動詞':
                rstr += jctconv.kata2hira(r)
            elif pos[i] == u'動詞' and pos[i+1] == u'助動詞':
                rstr += jctconv.kata2hira(r)
            elif pos[i] == u'動詞' and pos[i+1] == u'助詞':
                rstr += jctconv.kata2hira(r)
            elif pos[i] == u'動詞' and pos[i+1] == u'動詞':
                rstr += jctconv.kata2hira(r)
            elif pos[i] == u'接頭詞' and pos[i+1] == u'名詞':
                rstr += jctconv.kata2hira(r)
            elif pos2[i] == u'代名詞' and pos2[i+1] == u'副助詞/並立助詞/終助詞':
                rstr += jctconv.kata2hira(r)
            elif pos2[i+1] == u'接尾':
                rstr += jctconv.kata2hira(r)
            else:
                rstr += jctconv.kata2hira(r) + u' '

        # NOTE(review): this branch is unreachable — i < len(pos) - 2 implies
        # i < len(pos) - 1, which the branch above already captured, so these
        # three-token rules never fire.  Probable branch-ordering bug; confirm
        # intended precedence before fixing.
        elif i < len(pos) - 2:
            if pos[i] == u'助動詞' and pos[i+1] == u'助詞' and pos[i+2] == u'助詞':
                rstr += jctconv.kata2hira(r)
            elif pos[i] == u'助動詞' and pos[i+1] == u'名詞' and pos[i+2] == u'助動詞':
                rstr += jctconv.kata2hira(r)
            else:
                rstr += jctconv.kata2hira(r) + u' '

        # Last token: always close the chunk with a space.
        else:
            rstr += jctconv.kata2hira(r) + u' '

    return rstr
Example #12
0
def chunk_with_hira(istr, keep_katakana=False):
    """Tokenize *istr* with Janome and return space-separated hiragana chunks.

    Each token's reading is converted to hiragana.  A space normally follows
    a token, but is suppressed when the token and its successor form a
    grammatically bound pair (verb + auxiliary, prefix + noun, suffix, ...),
    so bound tokens end up inside the same chunk.  (Python 3 variant.)

    Args:
        istr: input text for Tokenizer().tokenize.
        keep_katakana: when True, tokens lacking a reading ('*') keep their
            katakana surface instead of being converted to hiragana.

    Returns:
        A string of hiragana chunks separated by spaces.
    """
    t = Tokenizer()
    tokens = t.tokenize(istr)

    # Per-token readings and surfaces.
    readings = []
    for token in tokens:
        reading = token.reading
        readings.append(reading)

    surfaces = [x.surface for x in tokens]

    # Coarse part-of-speech (first CSV field).
    pos = []
    for token in tokens:
        p = token.part_of_speech.split(',')[0]
        pos.append(p)

    # Fine part-of-speech (second CSV field).
    pos2 = []
    for token in tokens:
        p = token.part_of_speech.split(',')[1]
        pos2.append(p)

    rstr = ''
    for i, z in enumerate(zip(readings, surfaces, pos, pos2)):
        r, s, p, p2 = z

        # No reading available: fall back to the surface form.
        if r == '*':
            if not keep_katakana:
                if re.match(TOKENS_KATAKANA, s):
                    rstr += jctconv.kata2hira(s) + ' '
                else:
                    rstr += s + ' '
            else:
                rstr += s + ' '
            continue

        # Two-token glue rules: omit the trailing space when this POS pair binds.
        if i < len(pos) - 1:
            if pos[i] == '助動詞' and pos[i+1] == '助動詞':
                rstr += jctconv.kata2hira(r)
            elif pos[i] == '動詞' and pos[i+1] == '助動詞':
                rstr += jctconv.kata2hira(r)
            elif pos[i] == '動詞' and pos[i+1] == '助詞':
                rstr += jctconv.kata2hira(r)
            elif pos[i] == '動詞' and pos[i+1] == '動詞':
                rstr += jctconv.kata2hira(r)
            elif pos[i] == '接頭詞' and pos[i+1] == '名詞':
                rstr += jctconv.kata2hira(r)
            elif pos2[i] == '代名詞' and pos2[i+1] == '副助詞/並立助詞/終助詞':
                rstr += jctconv.kata2hira(r)
            elif pos2[i+1] == '接尾':
                rstr += jctconv.kata2hira(r)
            else:
                rstr += jctconv.kata2hira(r) + ' '

        # NOTE(review): this branch is unreachable — i < len(pos) - 2 implies
        # i < len(pos) - 1, which the branch above already captured, so these
        # three-token rules never fire.  Probable branch-ordering bug; confirm
        # intended precedence before fixing.
        elif i < len(pos) - 2:
            if pos[i] == '助動詞' and pos[i+1] == '助詞' and pos[i+2] == '助詞':
                rstr += jctconv.kata2hira(r)
            elif pos[i] == '助動詞' and pos[i+1] == '名詞' and pos[i+2] == '助動詞':
                rstr += jctconv.kata2hira(r)
            else:
                rstr += jctconv.kata2hira(r) + ' '

        # Last token: always close the chunk with a space.
        else:
            rstr += jctconv.kata2hira(r) + ' '

    return rstr
Example #13
0
def test_kata2hira():
    """kata2hira converts katakana to hiragana and honours the *ignore* set."""
    converted = jctconv.kata2hira('巴マミ')
    assert_equal(converted, '巴まみ')
    partially = jctconv.kata2hira('マミサン', ignore='ン')
    assert_equal(partially, 'まみさン')
    _compare(jctconv.kata2hira, FULL_KANA, HIRAGANA)
Example #14
0
def normalize_word(word):
    """Normalize *word* so variants compare equal: hiragana + lower case."""
    return jctconv.kata2hira(word).lower()
Example #15
0
def test_kata2hira():
    """kata2hira handles mixed text and the full katakana table (Python 2)."""
    mixed = jctconv.kata2hira(u'巴マミ')
    assert_equal(mixed, u'巴まみ')
    assert_equal(jctconv.kata2hira(FULL_KANA), HIRAGANA)
Example #16
0
}



# Load per-kanji on'yomi readings from onyomi.tsv.
# Format per line: kanji<TAB>space-separated katakana readings (2nd field optional).
onyomis = {}
# with-statement: the original never closed the file handle.
with open('onyomi.tsv', 'r') as onyomi_tsv:
    for line in onyomi_tsv:
        fields = line.strip().split("\t")
        kanji = fields[0]

        if len(fields) < 2:
            onyomis[kanji] = []
            continue

        ons = fields[1].split()
        ons = [jctconv.kata2hira(on) for on in ons]
        # should we use this information (bound readings) somehow?
        ons = [on.replace('-', '') for on in ons]
        ons = [on.strip() for on in ons]
        ons = list(set(ons))
        onyomis[kanji] = ons

kunyomis = {}
kunyomi_tsv = open('kunyomi.tsv', 'r')
for line in kunyomi_tsv:
    fields = line.strip().split("\t")
    kanji = fields[0]

    if len(fields) < 2:
        kunyomis[kanji] = []
        continue