def process_reading(reading):
    """Return the set of hiragana forms derived from one reading.

    The reading is converted with ``common.to_hiragana``, cut at the first
    ``"."`` and stripped of leading/trailing ``"-"`` (presumably the
    okurigana separator and affix markers of dictionary-style readings --
    TODO confirm against the data source).  The result always contains that
    normalized base reading, plus every form reachable by repeatedly
    applying the sound-change rules in ``generate_extra_readings``.

    A reading that normalizes to the empty string yields ``{""}`` instead
    of raising ``IndexError``.
    NOTE(review): callers that cannot use an empty reading should filter it.
    """
    hiragana = common.to_hiragana(reading)
    hiragana = hiragana.split(".")[0].strip("-")

    def generate_extra_readings(hiragana):
        """Yield the one-step variants of a single hiragana reading."""
        # Nothing to inspect on an empty string; indexing below would fail.
        if not hiragana:
            return
        first = hiragana[0]
        rest = hiragana[1:]
        # handle sokuon: a final き/く/ち/つ may become a small っ
        if hiragana[-1] in "きくちつ":
            yield hiragana[:-1] + "っ"
        # handle rendaku: for these kana the voiced (dakuten) form is the
        # next code point in the Unicode hiragana block
        if first in "かきくけこさしすせそたちつてとはひふへほ":
            yield chr(ord(first) + 1) + rest
        # handle handakuten for ha row: the ぱ-row form is two code points up
        if first in "はひふへほ":
            yield chr(ord(first) + 2) + rest
        # handle homophones for dzi, dzu: ぢ/づ are also written じ/ず
        if first in "ちつ":
            yield {"ち": "じ", "つ": "ず"}[first] + rest

    # Worklist closure: each reading is expanded exactly once, rather than
    # re-expanding the whole set on every pass until it stops growing.
    generated_readings = {hiragana}
    pending = [hiragana]
    while pending:
        current = pending.pop()
        for variant in generate_extra_readings(current):
            if variant not in generated_readings:
                generated_readings.add(variant)
                pending.append(variant)
    return generated_readings
def process_furigana(kanji, kana):
    """Split one (text, reading) segment into ruby pairs and a score.

    When ``kanji`` is non-empty, consists only of characters accepted by
    ``common.is_kanji`` or ASCII digits, and its hiragana conversion differs
    from ``kana``, the work is delegated to ``split_reading`` (called with
    ``return_score=True``) and its result is returned unchanged.

    Otherwise the segment is kept whole with no reading, as
    ``[(kanji, None)]``, and the score is ``NO_RUBY_PENALTY`` summed once
    per character of ``kanji`` that ``common.is_kanji`` accepts.
    """
    def needs_ruby(char):
        # Kanji, or an ASCII digit '0'..'9'.
        return common.is_kanji(char) or ord('0') <= ord(char) <= ord('9')

    splittable = (
        common.to_hiragana(kanji) != kana
        and len(kanji) != 0
        and all(needs_ruby(char) for char in kanji)
    )
    if splittable:
        return split_reading(kanji, kana, return_score=True)

    penalty = sum(NO_RUBY_PENALTY for _ in filter(common.is_kanji, kanji))
    return [(kanji, None)], penalty
def finalize_furigana(l, return_score=False):
    """Flatten (text, reading) segments into ruby pairs, optionally scored.

    Args:
        l: iterable of ``(kanji, kana)`` pairs.  It is consumed in a single
            pass, so one-shot iterables are handled, and it may be empty.
        return_score: when true, return ``(furigana, total_score)`` instead
            of just ``furigana``.

    Returns:
        ``furigana`` is the concatenation of the pair lists produced for
        each segment; a pair whose second item is ``None`` carries no
        reading.  ``total_score`` is the sum of the per-segment scores plus
        ``KANA_MISMATCH_PENALTY`` times the Levenshtein distance between
        the hiragana of the input readings and the hiragana of the readings
        actually assigned (using the text itself where no reading was
        assigned).
    """
    def process_furigana(kanji, kana):
        # Split only segments made purely of kanji/ASCII digits whose
        # hiragana form differs from the reading; others stay whole.
        if (common.to_hiragana(kanji) != kana and len(kanji) != 0 and
                all(common.is_kanji(k) or ord('0') <= ord(k) <= ord('9')
                    for k in kanji)):
            return split_reading(kanji, kana, return_score=True)
        return ([(kanji, None)],
                sum(NO_RUBY_PENALTY for k in kanji if common.is_kanji(k)))

    # Accumulate in one pass instead of zip(*...), which raised ValueError
    # for an empty input and needed a second iteration over ``l``.
    furigana = []
    segment_scores = []
    source_kana = []
    for kanji, kana in l:
        pairs, score = process_furigana(kanji, common.to_hiragana(kana))
        furigana.extend(pairs)
        segment_scores.append(score)
        source_kana.append(kana)

    # The edit distance is only needed for the score; skip it otherwise.
    if not return_score:
        return furigana

    expected = common.to_hiragana("".join(source_kana))
    assigned = common.to_hiragana("".join(
        kana if kana is not None else kanji for kanji, kana in furigana))
    total_score = (sum(segment_scores) +
                   KANA_MISMATCH_PENALTY * Levenshtein.distance(expected, assigned))
    return furigana, total_score