def parse(text):
    """Parse *text* with CaboCha into a flat list of word strings.

    Each output entry is "<morph-info>,<yomi>,<accents>" where the yomi
    syllables and the per-syllable accent marks are joined with '_'.

    The original implementation duplicated the word-emission code for the
    mid-stream flush and the final flush; it is factored into
    _emit_phrase_words.  The final flush is also guarded so that input
    containing no chunk lines cannot reference an unbound phrase_info.
    """
    phrase = []
    words = []
    phrase_info = None
    for morph in cabocha_tagger.parseToString(text).split("\n"):
        stripped = morph.strip()
        if stripped == "" or stripped == "EOS":
            continue
        if stripped.split(" ")[0] == "*":
            # Chunk header: flush the morphemes collected for the previous
            # chunk, then remember this chunk's dependency info fields.
            if phrase:
                _emit_phrase_words(phrase, phrase_info, words)
                phrase = []
            phrase_info = morph.split(" ")[1:]
        else:
            phrase.append(morph)
    # Flush the trailing chunk (guard: nothing to flush on empty input).
    if phrase:
        _emit_phrase_words(phrase, phrase_info, words)
    return words


def _emit_phrase_words(phrase, phrase_info, words):
    """Convert one chunk's morphemes into word strings, appending to *words*."""
    phrase_lyrics = get_phrase(phrase, phrase_info)
    for word in phrase_lyrics['word']:
        accent = [syllable['accent'] for syllable in word['syllable']]
        yomi = "_".join(get_yomi(word['info'].split(',')[-1]))
        if yomi == "*":
            # No reading in the morph info; fall back to the
            # katakana-converted surface form.
            yomi = "_".join(get_yomi(jctconv.hira2kata(word['sur'])))
        words.append("%s,%s,%s" % (','.join(word['info'].split(',')[:-1]),
                                   yomi, '_'.join(accent)))
def add_verb(term, lemma, yomi):
    """Yield dictionary lines for the conjugated forms of verb *lemma*.

    Handles two godan verb classes: w-onbin verbs (suffix table chosen by
    whether the lemma ends in う) and ma-row verbs.
    """
    if is_godan_waonbin(lemma):
        # Select the suffix table for う-ending lemmas vs the rest.
        table = GODAN_SUFFIXES[0] if lemma.endswith('う') else GODAN_SUFFIXES[1]
        for pos, suffix in zip(GODAN_WAONBIN, table):
            # Swap the final character for this conjugation's suffix;
            # term/yomi are updated in place across iterations on purpose.
            term = term[:-1] + suffix
            yomi = yomi[:-1] + jctconv.hira2kata(suffix)
            yield write_dic(pos, term, lemma, yomi)
    elif is_godan_magyou(lemma):
        stem = lemma[:-1]
        stem_yomi = yomi[:-1]
        for suffix, pos in GODAN_MA:
            yield write_dic(pos, stem + suffix, lemma,
                            stem_yomi + jctconv.hira2kata(suffix))
def translate(self, text):
    """Translate *text* morpheme by morpheme via MeCab.

    Runs of morphemes tagged '数' (number) are collected, converted to an
    integer with self._ja2int, and translated as one unit; every other
    morpheme is translated with self._translate_node using its katakana
    reading (feature 7) when available.
    """
    text = encodeMeCab(text)
    node = self._tagger.parseToNode(text)
    result = []
    while node:
        # stat >= 2: BOS/EOS sentinel nodes — skip them.
        if node.stat>=2:
            node = node.next
            continue
        surface = decodeMeCab(node.surface)
        yomi = surface
        features = decodeMeCab(node.feature).split(',')
        if node.stat==0:
            # Known word: use the dictionary reading, folded to katakana.
            yomi = features[7]
            yomi = jctconv.hira2kata(yomi)
        if features[1]==u'数':
            # Collect the whole run of number morphemes into one string.
            # NOTE(review): if the input ends with a number, node becomes
            # None inside this loop and node.surface will raise — confirm
            # the tagger always yields a trailing EOS node here.
            number = u''
            while True:
                surface = decodeMeCab(node.surface)
                features = decodeMeCab(node.feature).split(',')
                if features[1]!=u'数':
                    break
                number += surface
                node = node.next
            number = self._ja2int(number)
            result.append(self._translate_int(number))
            # NOTE(review): after the break, node points at the first
            # non-number morpheme, and the node = node.next below skips it
            # without translating — verify this is intended.
        else:
            result.append(self._translate_node(surface, yomi, features))
        node = node.next
    # Convert long-vowel and sokuon marks (presumably re_long / re_ltu
    # match "char + mark" and double the char — confirm their patterns).
    text = u''.join(result)
    text = self.re_long.sub(u'\\1\\1', text)
    text = self.re_ltu.sub(u'\\1\\1', text)
    return text
def hira2kana(strings):
    """Convert full-width hiragana in *strings* to full-width katakana.

    Every other character passes through unchanged.
    http://d.hatena.ne.jp/mohayonao/20101213/1292237816
    """
    return jctconv.hira2kata(MultiBytes.convert2unicode(strings))
def get_yomi(word):
    """Return the katakana reading of *word*, or '*' when none is found."""
    cleaned = re_symbol.sub('', word)
    # First attempt: direct hiragana -> katakana conversion.
    candidate = jctconv.hira2kata(cleaned)
    if re_katakana.match(candidate):
        return candidate
    # Second attempt: ask MeCab for a reading.
    candidate = ''.join(mecab.to_yomi(cleaned))
    return candidate if re_katakana.match(candidate) else '*'
def get_kanji_part_yomi(surface, reading):
    '''Get yomi of kanji part of the string. Assume only one kanji part.

    Removes the katakana readings of up to two non-kanji runs from
    *reading* and returns the remainder as hiragana.  Returns None when
    *surface* contains no non-kanji run at all.
    '''
    nkidx = non_kanji_pos(surface)
    if nkidx is None:
        return None
    non_kanji = surface[nkidx[0]:nkidx[1]]
    # re.escape: the non-kanji run may contain regex metacharacters
    # (punctuation, '・', etc.) that would otherwise corrupt the pattern.
    yomi = re.sub(re.escape(jctconv.hira2kata(non_kanji)), '', reading)
    # case of two separated hiragana parts
    surface2 = surface[nkidx[1]:]
    nkidx2 = non_kanji_pos(surface2)
    if nkidx2 is None:
        return jctconv.kata2hira(yomi)
    non_kanji2 = surface2[nkidx2[0]:nkidx2[1]]
    yomi = re.sub(re.escape(jctconv.hira2kata(non_kanji2)), '', yomi)
    return jctconv.kata2hira(yomi)
def expand_string(string):
    """Return the list of expanded variants of *string*.

    For now the only expansion is hiragana -> katakana conversion.
    """
    return [jctconv.hira2kata(string)]
def add_shii(term, lemma, yomi):
    """Yield dictionary lines for the 〜しい adjective forms of *lemma*."""
    # Normalize the surface stem so it ends in exactly one 'し'.
    stem = term[:-1] if term.endswith('しい') else term
    if not stem.endswith('し'):
        stem += 'し'
    # Same normalization for the katakana reading ('シ').
    stem_yomi = yomi[:-1] if yomi.endswith('シイ') else yomi
    if not stem_yomi.endswith('シ'):
        stem_yomi += 'シ'
    for suffix, pos in SHII:
        yield write_dic(pos, stem + suffix, lemma,
                        stem_yomi + jctconv.hira2kata(suffix))
def convert(mozc_map, mozc_dir, output_dir):
    """Build <output_dir>/mozc.csv from the Mozc OSS dictionary files.

    Entries whose left context id is missing from *mozc_map* are dropped;
    readings are emitted in katakana.  (Python-2-style str.decode/encode
    on file lines is preserved from the original.)
    """
    src_pattern = os.path.join(
        mozc_dir, "src/data/dictionary_oss/dictionary*.txt")
    with open(os.path.join(output_dir, "mozc.csv"), "w") as out_fd:
        for path in glob.glob(src_pattern):
            with open(path) as in_fd:
                for raw in in_fd:
                    fields = raw.decode("utf8").strip().split("\t")
                    (yomi, lid, rid, cost, surface) = fields[:5]
                    if lid not in mozc_map:
                        continue
                    (new_id, pos) = mozc_map[lid]
                    yomi = jctconv.hira2kata(yomi)
                    record = ",".join(
                        [surface, new_id, new_id, "0",
                         pos, surface, yomi, yomi]) + "\n"
                    out_fd.write(record.encode("utf8", "replace"))
def parseline(line):
    """Parse one 'yomi /word1/word2/.../' dictionary line.

    Returns a MeCab CSV row for the first entry annotated with a
    noun-like part of speech (名詞/組織名/地名/人名), or None when no
    entry qualifies.
    """
    parts = line.split('/')
    for entry in parts[1:]:
        if entry == '':
            continue
        fields = entry.split(';')
        if len(fields) > 1 and re.search('(名詞|組織名|地名|人名)', fields[1]):
            hinshi = '名詞'
            # Commas would break the CSV row, so strip them from the word.
            keyword = fields[0].replace(',', '')
            yomi = parts[0].strip()
            kana = jctconv.hira2kata(yomi)
            return ('%s,0,0,%i,%s,%s,*,*,*,*,%s,%s,%s'
                    % (keyword, __cost, hinshi, '一般', yomi, kana, kana))
def convert(mozc_map, mozc_dir, output_dir):
    """Write mozc.csv to *output_dir* from the Mozc OSS dictionaries.

    Keeps only entries whose left context id appears in *mozc_map*,
    converting each reading to katakana.  (Python-2-style decode/encode
    of file lines is preserved from the original.)
    """
    out_path = os.path.join(output_dir, 'mozc.csv')
    dict_glob = os.path.join(
        mozc_dir, 'src/data/dictionary_oss/dictionary*.txt')
    with open(out_path, 'w') as out_fd:
        for dict_file in glob.glob(dict_glob):
            with open(dict_file) as in_fd:
                for raw_line in in_fd:
                    columns = raw_line.decode('utf8').strip().split('\t')
                    (yomi, lid, rid, cost, surface) = columns[:5]
                    if lid not in mozc_map:
                        continue
                    (new_id, pos) = mozc_map[lid]
                    kata = jctconv.hira2kata(yomi)
                    record = ','.join([surface, new_id, new_id, '0',
                                       pos, surface, kata, kata])
                    out_fd.write((record + '\n').encode('utf8', 'replace'))
def split_at_hiragana(surface, reading):
    '''Split *surface*/*reading* at the hiragana run between two kanji runs.

    Returns [surface, reading] unchanged when there is nothing to split
    (no kanji, only one kanji run, or a 々/ヶ separator); otherwise a
    4-element list [surface_head_with_hira, reading_head,
    surface_tail, reading_tail].
    '''
    kidx = kanji_pos(surface)
    if kidx is None:
        # only non-kanji
        return [surface, reading]
    rest = surface[kidx[1]:]
    kidx2 = kanji_pos(rest)
    if kidx2 is None:
        # only one kanji + hiragana
        return [surface, reading]
    hira = rest[:kidx2[0]]
    # multiple kanji
    if hira in ["々", "ヶ"]:
        # since this character is somehow defined as non-kanji
        return [surface, reading]
    # re.escape: the separator run is not guaranteed to be pure hiragana
    # and could contain regex metacharacters.
    m = re.search(r'(.*?' + re.escape(jctconv.hira2kata(hira)) + ')(.*)',
                  reading)
    # Fix: the tail must start after the whole separator run; the original
    # hard-coded "+ 1", which duplicated characters whenever len(hira) > 1.
    return [surface[:kidx[1]] + hira, m.group(1),
            surface[kidx[1] + len(hira):], m.group(2)]
def get_phrase(phrase, phrase_info):
    """Build the lyrics structure for one chunk of tab-separated morphemes.

    Returns a dict with keys:
      sur   -- space-joined surface forms of the chunk,
      info  -- space-joined chunk dependency fields,
      word  -- per-word dicts, each with sur/kana/info and a 'syllable'
               list of {sur, roma, accent} dicts.

    Accent marks come from get_accent over the chunk's concatenated
    surface and are consumed one per scored syllable via acc_idx.
    """
    phrase_lyrics = {"sur": " ".join([w.split("\t")[0] for w in phrase]),
                     "info": " ".join(phrase_info),
                     "word": []}
    accent = get_accent("".join([w.split("\t")[0] for w in phrase]))
    # mora: running mora count; acc_idx: next accent mark to consume.
    mora = 0
    acc_idx = 0
    for word in phrase:
        sur = word.split("\t")[0]
        if en_p.match(sur):
            # English token: replace the morph line with a synthesized one.
            word = parse_eng_word(sur)
        kana = word.split("\t")[1].split(",")[-1]
        kana = "".join(get_yomi(kana))
        if kana == "*":
            # No reading found; fall back to katakana-converted surface.
            kana = jctconv.hira2kata(sur)
        mora += get_mora(kana)
        # Re-encode special kana: ッ -> q, drop ー, ン -> N
        # (presumably the notation split_alpha expects — confirm).
        kana = "q".join(kana.split("ッ"))
        kana = "".join(kana.split("ー"))
        kana = "N".join(kana.split("ン"))
        word_alpha = split_alpha(kana)
        word_lyrics = {"sur": sur, "kana": kana, "info": word, "syllable": []}
        for w_a in word_alpha:
            syllable_lyrics = {}
            # w_a: (surface-kana, romaji) pair; roma == "*" means unscored.
            out_char = [w_a[0], w_a[1]]
            if out_char[1] == "*":
                syllable_lyrics["sur"] = w_a[0]
                syllable_lyrics["roma"] = w_a[1]
                syllable_lyrics["accent"] = "*"
            else:
                if len(accent) <= mora:
                    # Accent list too short for this word's mora span:
                    # reuse the final accent mark.
                    syllable_lyrics["sur"] = w_a[0]
                    syllable_lyrics["roma"] = w_a[1]
                    syllable_lyrics["accent"] = accent[-1]
                else:
                    syllable_lyrics["sur"] = w_a[0]
                    syllable_lyrics["roma"] = w_a[1]
                    # NOTE(review): acc_idx+1 > len(accent) looks
                    # unreachable when len(accent) > mora holds — confirm.
                    if acc_idx+1 > len(accent):
                        syllable_lyrics["accent"] = accent[-1]
                    else:
                        syllable_lyrics["accent"] = accent[acc_idx]
                    acc_idx += 1
            word_lyrics["syllable"].append(syllable_lyrics)
        phrase_lyrics["word"].append(word_lyrics)
    return phrase_lyrics
def test_hira2kata():
    """hira2kata: plain conversion, ignore= handling, and full kana table."""
    converted = jctconv.hira2kata('ともえまみ')
    assert_equal(converted, 'トモエマミ')
    # Characters in the ignore set must pass through unconverted.
    ignored = jctconv.hira2kata('まどまぎ', ignore='ど')
    assert_equal(ignored, 'マどマギ')
    _compare(jctconv.hira2kata, HIRAGANA, FULL_KANA)
def test_hira2kata():
    """hira2kata: a sample word plus the full hiragana -> katakana table."""
    result = jctconv.hira2kata(u'ともえまみ')
    assert_equal(result, u'トモエマミ')
    table = jctconv.hira2kata(HIRAGANA)
    assert_equal(table, FULL_KANA)
def normalize_yomi(self, yomi):
    """Return *yomi* as katakana with ウ゛ folded to ヴ and spaces removed."""
    kata = jctconv.hira2kata(yomi)
    kata = kata.replace('ウ゛', 'ヴ')
    return kata.replace(' ', '')
def normalize(s):
    """Katakana-fold *s*, drop '・' and symbols, normalize chouon marks."""
    folded = jctconv.hira2kata(s).replace('・', '')
    folded = re_symbol.sub('', folded)
    return re_tyouon.sub('ー', folded)