def change_case(knp_lines, prev_case_katakana, new_case_katakana, verb_chunk_ind):
    """Rewrite the case particle of the argument chunk of a verb in KNP output.

    knp_lines          -- KNP analysis output, one string per line (Python 2 bytes)
    prev_case_katakana -- current case name in katakana (e.g. 'ヲ')
    new_case_katakana  -- replacement case name in katakana
    verb_chunk_ind     -- index (into knp_lines) of the verb's chunk line

    Returns the rewritten token lines, or [] when no chunk bearing the
    requested case modifies the verb chunk.
    """
    chunk_line = knp_lines[verb_chunk_ind]
    chunk_num = get_chunk_num(chunk_line)
    # If the same case argument appears twice, take the later one (the one
    # closer to the predicate). Unlikely, but e.g. for the odd sentence
    # 山を海を愛す this would pick 「海を」.
    prev_case_chunks = [(mod_chunk_ind, mod_chunk_line)
                        for (mod_chunk_ind, mod_chunk_line)
                        in get_mod_chunk_and_mod_chunk_ind_lst(knp_lines, chunk_num)
                        if (("<係:%s格>" % prev_case_katakana) in mod_chunk_line)]
    # No chunk with the requested case: return an empty list.
    if len(prev_case_chunks) == 0:
        return []
    prev_case_chunk_ind, prev_case_chunk_line = prev_case_chunks[-1]

    # Rewrite the matching case-particle token line: a token line located
    # after prev_case_chunk_ind and before verb_chunk_ind whose particle
    # matches prev_case is replaced by the new_case particle line; every
    # other line is returned unchanged.
    def loc_map_func(ind, line, prev_case_chunk_ind, verb_chunk_ind, prev_case_hiragana, new_case_hiragana):
        if prev_case_chunk_ind < ind < verb_chunk_ind:
            if is_token(line) and line.startswith('%s %s %s 助詞 9 格助詞' % (prev_case_hiragana, prev_case_hiragana, prev_case_hiragana)):
                return "%s %s %s 助詞 9 格助詞 1 * 0 * 0 NIL" % (new_case_hiragana, new_case_hiragana, new_case_hiragana)
            else:
                return line
        else:
            return line

    # Case names appear in katakana inside KNP tags but in hiragana on the
    # token lines, hence the conversion (bytes -> unicode -> bytes, Python 2).
    prev_case_hiragana = jctconv.kata2hira(prev_case_katakana.decode('utf-8')).encode('utf-8')
    new_case_hiragana = jctconv.kata2hira(new_case_katakana.decode('utf-8')).encode('utf-8')
    tmp_knp_lines = map(lambda (ind, line): loc_map_func(ind, line, prev_case_chunk_ind, verb_chunk_ind, prev_case_hiragana, new_case_hiragana), enumerate(knp_lines))
    # NOTE(review): only token lines survive this filter, dropping KNP chunk
    # header lines, even though loc_map_func passes non-token lines through
    # unchanged -- confirm the filtering is intended.
    return [line for line in tmp_knp_lines if is_token(line)]
def toHiragana(word):
    """Convert *word* (UTF-8 bytes) to its hiragana reading.

    Kana-only input is converted directly; anything else is run through
    MeCab and the reading field of the first morpheme is used.
    Returns None when MeCab provides no usable reading.
    """
    # Pure kana needs no morphological analysis.
    if isKatakana(word) or isHiragana(word):
        return jctconv.kata2hira(unicode(word, 'utf-8')).encode('utf-8')

    tokens = MeCab.Tagger().parse(word).split()
    if tokens[0] == 'EOS':
        return None  # MeCab produced no morphemes at all

    features = tokens[1].split(',')
    if len(features) < 8:
        return None  # reading field (index 7) is absent

    hira = jctconv.kata2hira(unicode(features[7], 'utf-8'))
    return hira.encode('utf-8')
def kana2hira(strings):
    """Convert full-width katakana in *strings* to full-width hiragana.

    All other characters pass through untouched; the input is first
    normalized to unicode.
    """
    return jctconv.kata2hira(MultiBytes.convert2unicode(strings))
def add_phonetic(token):
    """Return the token surface annotated with its hiragana reading in '(...)'."""
    # If the surface mixes kanji with other characters (digits, Latin,
    # kana), split it so the reading attaches to the kanji run only.
    if re.search('[0-9a-zA-Zあ-んア-ン]', token.surface):
        before, kanji, reading, okurigana = disassembly(token)
        return before + '|' + kanji + '(' + reading + ')' + okurigana
    # Pure kanji surface: annotate the whole token.
    return '|' + token.surface + '(' + jctconv.kata2hira(token.reading) + ')'
def translate(self, src):
    """Translate *src* by walking MeCab nodes and return hiragana text.

    Per node, the surface is replaced via the module-level `converter`
    table when possible, otherwise by a conjugation ending from
    `_get_gobi`, otherwise by the base form (feature field 8), falling
    back to the raw surface. Honorific 'さん' and '、' separators are
    appended per `_check_san` / `_check_separator`.
    """
    # Fix: the original kept a `pre_node` variable that was never read --
    # and it was assigned *after* advancing, so it never held the previous
    # node anyway. Removed as dead code.
    n = self.mecab.parseToNode(src)
    text = ''
    while n:
        f = n.feature.split(',')
        if n.surface in converter:
            text += converter[n.surface]
        elif len(f) > 8:
            gobi = self._get_gobi(n)
            if gobi is not None:
                text += gobi
            elif f[8] != '*':
                text += f[8]  # base/pronunciation field when defined
            else:
                text += n.surface
        else:
            text += n.surface
        if self._check_san(n):
            text += 'さん'
        elif self._check_separator(n):
            text += '、'
        n = n.next
    return jctconv.kata2hira(text.decode('utf-8')).encode('utf-8')
def get_kanji_part_yomi(surface, reading):
    """Get yomi (reading) of the kanji part of *surface*.

    Assumes at most one contiguous kanji part: the non-kanji runs around
    it (converted to katakana) are removed from *reading* and the
    remainder is returned in hiragana. Returns None when the surface has
    no non-kanji part at all.
    """
    nkidx = non_kanji_pos(surface)
    if nkidx is None:
        return None
    non_kanji = surface[nkidx[0]:nkidx[1]]
    # re.escape: the non-kanji text is literal data, not a pattern.
    # Without escaping, metacharacters in the surface (e.g. '.', '(', '+')
    # would corrupt or crash the substitution.
    yomi = re.sub(re.escape(jctconv.hira2kata(non_kanji)), '', reading)
    # Handle a second, separated non-kanji run (e.g. kana on both sides).
    surface2 = surface[nkidx[1]:]
    nkidx2 = non_kanji_pos(surface2)
    if nkidx2 is None:
        return jctconv.kata2hira(yomi)
    non_kanji2 = surface2[nkidx2[0]:nkidx2[1]]
    yomi = re.sub(re.escape(jctconv.hira2kata(non_kanji2)), '', yomi)
    return jctconv.kata2hira(yomi)
def Kanji2Hiragana(message):
    """Return a hiragana reading of *message* via the `mecab -Oyomi` command.

    ':name:' emoji-style spans and newlines are stripped before analysis.
    """
    text = re.sub(':.*:', '', message)  # NOTE(review): greedy -- spans first ':' to last ':'; confirm intended
    text = re.sub('\n', '', text)
    # Feed the text to mecab directly instead of piping through `echo`:
    # echo misinterprets leading '-n'/'-e' arguments and costs an extra
    # process for nothing.
    p = subprocess.Popen(['mecab', '-Oyomi'],
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE)
    # `echo` used to append a trailing newline; keep it so mecab still
    # sees one complete input line.
    output = p.communicate(text + '\n')[0]
    return jctconv.kata2hira(re.sub(r'\n', '', output.decode('utf-8')))
def format_arg_case(case):
    """Map a KNP case label to its hiragana particle.

    Labels containing "未格" (caseless; labels like 'hoge:同格未格' occur,
    e.g. for 「人間以外の被造物に対してお辞儀などしてはならない。」) become
    the topic particle 'は'. Otherwise the katakana case name is converted
    to hiragana after stripping the '格' character.
    """
    if "未格" in case:
        return 'は'
    # Drop the '格' suffix, then katakana -> hiragana (Python 2 bytes round-trip).
    stripped = case.replace('格', '').decode('utf-8')
    return jctconv.kata2hira(stripped).encode('utf-8')
def preprocesseJct(words):
    """Augment *words* with MeCab readings and their hiragana forms.

    For each word with a reading (feature field 7), both the katakana
    reading and its hiragana conversion are added. Returns a deduplicated
    list (order not preserved, as before).
    """
    # Fix: copy.deepcopy on a list of immutable strings is pure overhead;
    # a shallow list copy is fully equivalent here.
    new_words = list(words)
    mecab = MeCab.Tagger()
    for w in words:
        parsed = mecab.parse(w).split()
        if parsed[0] == 'EOS':
            continue  # no morphemes for this word
        features = parsed[1].split(',')
        if len(features) < 8:
            continue  # reading field missing
        new_words.append(features[7])
        hira = jctconv.kata2hira(unicode(features[7], 'utf-8'))
        new_words.append(hira.encode('utf-8'))
    new_words = list(set(new_words))
    return new_words
def disassembly(token):
    """Split *token* into (leading kana, kanji run, kanji reading, okurigana).

    Assumes the surface contains a single contiguous kanji run. The kanji
    reading is obtained by removing the literal leading/trailing kana from
    the token's full hiragana reading.
    """
    in_kanji = False
    before_kana = ''
    chinese = ''
    feed = ''
    read = jctconv.kata2hira(token.reading)
    for ch in token.surface:
        # NOTE(review): \u9FD0 is an unusual upper bound for the CJK block
        # (commonly \u9FFF) -- kept as-is to preserve behavior.
        if re.search('[\u4E00-\u9FD0]', ch):
            chinese += ch
            in_kanji = True
        elif in_kanji:
            feed += ch        # okurigana: characters after the kanji run
        else:
            before_kana += ch  # characters before the kanji run
    # Fix: use str.replace instead of re.sub -- the kana/ASCII here is
    # literal text, and regex metacharacters ('.', '(', '+', ...) in the
    # surface would corrupt or crash a pattern-based substitution.
    # (Both forms replace every occurrence, so behavior is otherwise
    # unchanged.)
    read = read.replace(feed, '')
    read = read.replace(before_kana, '')
    return before_kana, chinese, read, feed
def chunk_with_hira(istr, keep_katakana=False):
    """Tokenize *istr* and return a space-chunked hiragana rendering (Python 2).

    Adjacent tokens forming one grammatical unit (verb + auxiliary,
    prefix + noun, etc.) are glued together; other token boundaries get a
    space. Tokens without a reading ('*') fall back to their surface,
    converting pure-katakana surfaces unless keep_katakana is set.
    """
    t = Tokenizer()
    tokens = t.tokenize(istr)
    readings = [x.reading.decode('utf-8') for x in tokens]
    surfaces = [x.surface for x in tokens]
    # Primary and secondary part-of-speech tags, normalized to unicode.
    pos = []
    for token in tokens:
        p = token.part_of_speech.split(',')[0]
        pos.append(p if isinstance(p, unicode) else p.decode('utf-8'))
    pos2 = []
    for token in tokens:
        p = token.part_of_speech.split(',')[1]
        pos2.append(p if isinstance(p, unicode) else p.decode('utf-8'))
    rstr = u''
    for i, z in enumerate(zip(readings, surfaces, pos, pos2)):
        r, s, p, p2 = z
        if r == u'*':
            # No reading available: use the surface form.
            if not keep_katakana and re.match(TOKENS_KATAKANA, s):
                rstr += jctconv.kata2hira(s) + u' '
            else:
                rstr += s + u' '
            continue
        # BUG FIX: the three-token patterns originally lived in an
        # `elif i < len(pos) - 2:` branch tested AFTER `if i < len(pos) - 1:`.
        # Since i < len-2 implies i < len-1, that branch was unreachable and
        # the triple rules never fired. Test them first so they can apply.
        if i < len(pos) - 2 and (
                (pos[i] == u'助動詞' and pos[i+1] == u'助詞' and pos[i+2] == u'助詞') or
                (pos[i] == u'助動詞' and pos[i+1] == u'名詞' and pos[i+2] == u'助動詞')):
            rstr += jctconv.kata2hira(r)
        elif i < len(pos) - 1:
            # Two-token glue rules: no trailing space when matched.
            if pos[i] == u'助動詞' and pos[i+1] == u'助動詞':
                rstr += jctconv.kata2hira(r)
            elif pos[i] == u'動詞' and pos[i+1] == u'助動詞':
                rstr += jctconv.kata2hira(r)
            elif pos[i] == u'動詞' and pos[i+1] == u'助詞':
                rstr += jctconv.kata2hira(r)
            elif pos[i] == u'動詞' and pos[i+1] == u'動詞':
                rstr += jctconv.kata2hira(r)
            elif pos[i] == u'接頭詞' and pos[i+1] == u'名詞':
                rstr += jctconv.kata2hira(r)
            elif pos2[i] == u'代名詞' and pos2[i+1] == u'副助詞/並立助詞/終助詞':
                rstr += jctconv.kata2hira(r)
            elif pos2[i+1] == u'接尾':
                rstr += jctconv.kata2hira(r)
            else:
                rstr += jctconv.kata2hira(r) + u' '
        else:
            # Last token: always terminate the chunk.
            rstr += jctconv.kata2hira(r) + u' '
    return rstr
def chunk_with_hira(istr, keep_katakana=False):
    """Tokenize *istr* and return a space-chunked hiragana rendering.

    Adjacent tokens forming one grammatical unit (verb + auxiliary,
    prefix + noun, etc.) are glued together; other token boundaries get a
    space. Tokens without a reading ('*') fall back to their surface,
    converting pure-katakana surfaces unless keep_katakana is set.
    """
    t = Tokenizer()
    tokens = t.tokenize(istr)
    readings = [token.reading for token in tokens]
    surfaces = [x.surface for x in tokens]
    # Primary and secondary part-of-speech tags.
    pos = [token.part_of_speech.split(',')[0] for token in tokens]
    pos2 = [token.part_of_speech.split(',')[1] for token in tokens]
    rstr = ''
    for i, z in enumerate(zip(readings, surfaces, pos, pos2)):
        r, s, p, p2 = z
        if r == '*':
            # No reading available: use the surface form.
            if not keep_katakana and re.match(TOKENS_KATAKANA, s):
                rstr += jctconv.kata2hira(s) + ' '
            else:
                rstr += s + ' '
            continue
        # BUG FIX: the three-token patterns originally lived in an
        # `elif i < len(pos) - 2:` branch tested AFTER `if i < len(pos) - 1:`.
        # Since i < len-2 implies i < len-1, that branch was unreachable and
        # the triple rules never fired. Test them first so they can apply.
        if i < len(pos) - 2 and (
                (pos[i] == '助動詞' and pos[i+1] == '助詞' and pos[i+2] == '助詞') or
                (pos[i] == '助動詞' and pos[i+1] == '名詞' and pos[i+2] == '助動詞')):
            rstr += jctconv.kata2hira(r)
        elif i < len(pos) - 1:
            # Two-token glue rules: no trailing space when matched.
            if pos[i] == '助動詞' and pos[i+1] == '助動詞':
                rstr += jctconv.kata2hira(r)
            elif pos[i] == '動詞' and pos[i+1] == '助動詞':
                rstr += jctconv.kata2hira(r)
            elif pos[i] == '動詞' and pos[i+1] == '助詞':
                rstr += jctconv.kata2hira(r)
            elif pos[i] == '動詞' and pos[i+1] == '動詞':
                rstr += jctconv.kata2hira(r)
            elif pos[i] == '接頭詞' and pos[i+1] == '名詞':
                rstr += jctconv.kata2hira(r)
            elif pos2[i] == '代名詞' and pos2[i+1] == '副助詞/並立助詞/終助詞':
                rstr += jctconv.kata2hira(r)
            elif pos2[i+1] == '接尾':
                rstr += jctconv.kata2hira(r)
            else:
                rstr += jctconv.kata2hira(r) + ' '
        else:
            # Last token: always terminate the chunk.
            rstr += jctconv.kata2hira(r) + ' '
    return rstr
def test_kata2hira():
    """kata2hira: basic conversion, ignore list, and the full kana table."""
    mixed = jctconv.kata2hira('巴マミ')
    assert_equal(mixed, '巴まみ')
    kept = jctconv.kata2hira('マミサン', ignore='ン')
    assert_equal(kept, 'まみさン')
    _compare(jctconv.kata2hira, FULL_KANA, HIRAGANA)
def normalize_word(word):
    """Normalize *word* so spelling variants compare equal.

    Katakana is folded to hiragana and ASCII letters are lowercased.
    """
    return jctconv.kata2hira(word).lower()
def test_kata2hira():
    """kata2hira converts katakana and leaves other characters alone."""
    result = jctconv.kata2hira(u'巴マミ')
    assert_equal(result, u'巴まみ')
    assert_equal(jctconv.kata2hira(FULL_KANA), HIRAGANA)
} onyomis = {} onyomi_tsv = open('onyomi.tsv', 'r') for line in onyomi_tsv: fields = line.strip().split("\t") kanji = fields[0] if len(fields) < 2: onyomis[kanji] = [] continue ons = fields[1].split() ons = [jctconv.kata2hira(on) for on in ons] # should we use this information (bound readings) somehow? ons = [on.replace('-', '') for on in ons] ons = [on.strip() for on in ons] ons = list(set(ons)) onyomis[kanji] = ons kunyomis = {} kunyomi_tsv = open('kunyomi.tsv', 'r') for line in kunyomi_tsv: fields = line.strip().split("\t") kanji = fields[0] if len(fields) < 2: kunyomis[kanji] = [] continue