Esempio n. 1
0
def process_reading(reading):
    """Expand a reading into the set of hiragana variants derived from it.

    The reading is converted with ``common.to_hiragana``, truncated at the
    first ``.`` and stripped of leading/trailing ``-`` (presumably
    dictionary-style okurigana and affix markers -- confirm against the
    data source).  Variants are then derived repeatedly, so variants of
    variants are included, until nothing new appears:

    * sokuon: a final き/く/ち/つ is replaced by っ;
    * rendaku: a leading ka/sa/ta/ha-row kana is replaced by its voiced form;
    * handakuten: a leading ha-row kana is replaced by its semi-voiced form;
    * a leading ち/つ is replaced by the homophones じ/ず.

    Args:
        reading: The raw reading string.

    Returns:
        A set of hiragana strings that always contains the normalised base
        reading.  If the base reading is empty the result is ``{""}``.
    """
    base = common.to_hiragana(reading)
    base = base.split(".")[0].strip("-")

    def generate_extra_readings(hiragana):
        # An empty reading has no first/last kana to transform; without this
        # guard the indexing below raises IndexError.
        if not hiragana:
            return
        first, rest = hiragana[0], hiragana[1:]
        # handle sokuon
        if hiragana[-1] in "きくちつ":
            yield hiragana[:-1] + "っ"
        # handle rendaku: in the Unicode hiragana block the voiced kana is
        # the code point immediately after its unvoiced counterpart
        if first in "かきくけこさしすせそたちつてとはひふへほ":
            yield chr(ord(first) + 1) + rest
        # handle handakuten for ha row: the semi-voiced kana sits two code
        # points after the plain one
        if first in "はひふへほ":
            yield chr(ord(first) + 2) + rest
        # handle homophones for dzi, dzu
        if first in "ちつ":
            yield {"ち": "じ", "つ": "ず"}[first] + rest

    # Expand only the readings discovered in the previous round; this reaches
    # the same closure as re-expanding the whole set until it stops growing.
    generated_readings = {base}
    frontier = {base}
    while frontier:
        produced = {new_reading
                    for old_reading in frontier
                    for new_reading in generate_extra_readings(old_reading)}
        frontier = produced - generated_readings
        generated_readings |= frontier

    return generated_readings
Esempio n. 2
0
 def process_furigana(kanji, kana):
     """Split ``kana`` over ``kanji`` when the text qualifies, else leave it bare.

     The text qualifies when its hiragana form (``common.to_hiragana``)
     differs from ``kana``, it is non-empty, and every character is either a
     kanji according to ``common.is_kanji`` or an ASCII digit.

     Returns:
         ``split_reading(kanji, kana, return_score=True)`` for qualifying
         text; otherwise ``([(kanji, None)], penalty)`` where ``penalty`` is
         ``NO_RUBY_PENALTY`` added once per kanji character in ``kanji``.
     """
     def takes_ruby(ch):
         # A kanji, or one of the ASCII digits '0'..'9'.
         return common.is_kanji(ch) or ord('0') <= ord(ch) <= ord('9')

     splittable = (common.to_hiragana(kanji) != kana
                   and len(kanji) != 0
                   and all(map(takes_ruby, kanji)))

     if not splittable:
         # No ruby is attached: charge the penalty for each kanji left bare.
         penalty = 0
         for ch in kanji:
             if common.is_kanji(ch):
                 penalty += NO_RUBY_PENALTY
         return [(kanji, None)], penalty

     return split_reading(kanji, kana, return_score=True)
Esempio n. 3
0
def finalize_furigana(l, return_score=False):
    """Flatten ``(kanji, kana)`` segments into furigana pairs, optionally scored.

    Each segment's ``kana`` is converted with ``common.to_hiragana`` and the
    segment is either split further by ``split_reading`` or kept as a single
    pair without a reading.  The per-segment results are concatenated into
    one flat list.

    The total score is the sum of the per-segment scores plus
    ``KANA_MISMATCH_PENALTY`` times the Levenshtein distance between the
    hiragana of the concatenated input kana and the hiragana of the
    concatenated output (each pair's reading, or its text when the reading
    is ``None``).

    Args:
        l: Iterable of ``(kanji, kana)`` pairs.  It is consumed once, so a
            one-shot iterator is accepted.
        return_score: When true, return ``(furigana, total_score)`` instead
            of just ``furigana``.

    Returns:
        A list of ``(text, reading_or_None)`` pairs, or a
        ``(list, total_score)`` tuple when ``return_score`` is true.  Empty
        input gives an empty list and a score of ``0``.
    """
    # The segments are traversed twice below (once to split, once for the
    # mismatch penalty); materialise them so an iterator is not exhausted
    # by the first pass.
    segments = list(l)
    if not segments:
        # With no segments, zip(*...) yields nothing and cannot be unpacked
        # into two names, so return the empty result directly.
        return ([], 0) if return_score else []

    def process_furigana(kanji, kana):
        # Split only text that differs from its reading, is non-empty and
        # consists solely of kanji / ASCII digits.
        if (common.to_hiragana(kanji) != kana
                and len(kanji) != 0
                and all(common.is_kanji(k) or ord('0') <= ord(k) <= ord('9') for k in kanji)):
            return split_reading(kanji, kana, return_score=True)
        else:
            # No reading attached: penalise once per kanji left without ruby.
            return [(kanji, None)], sum(NO_RUBY_PENALTY for k in kanji
                                        if common.is_kanji(k))

    nested_furigana, scores = zip(*(process_furigana(kanji, common.to_hiragana(kana))
                                    for kanji, kana in segments))
    furigana = [pair
                for nested in nested_furigana
                for pair in nested]

    expected_kana = common.to_hiragana("".join(kana for _, kana in segments))
    produced_kana = common.to_hiragana("".join(kana if kana is not None else kanji
                                               for kanji, kana in furigana))
    total_score = sum(scores) + KANA_MISMATCH_PENALTY * \
        Levenshtein.distance(expected_kana, produced_kana)

    return (furigana, total_score) if return_score else furigana