Ejemplo n.º 1
0
    def idioms(self, string, descriptive=False, verbose=False):
        '''Apply the replacement rules listed in `idioms.txt` to `string`.

        `idioms.txt` is looked up in the directory that contains this module.
        For every line, everything from the first "#" on is dropped; if the
        rest has the form "left===right", it is applied with
        `re.sub(left, right, ...)`, so the left side is treated as a regular
        expression. Rules are applied in file order.

        string: input string.
        descriptive: not used.
        verbose: boolean, forwarded to `gloss`.

        Returns the string after all rules have been applied.

        >>> idioms("지금 mp3 파일을 다운받고 있어요")
        지금 엠피쓰리 파일을 다운받고 있어요
        '''
        rule = "from idioms.txt"
        out = string

        # os.path.join supplies the separator; plain string concatenation
        # glued the file name directly onto the directory name.
        path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            "idioms.txt")

        # The context manager closes the file even when a rule raises.
        with open(path, 'r', encoding="utf8") as rules:
            for line in rules:
                line = line.split("#")[0].strip()
                if "===" in line:
                    # Split on the first delimiter only, so a replacement
                    # that itself contains "===" does not break unpacking.
                    str1, str2 = line.split("===", 1)
                    out = re.sub(str1, str2, out)
        gloss(verbose, out, string, rule)

        return out
Ejemplo n.º 2
0
def link3(inp, descriptive=False, verbose=False):
    '''Link a final consonant to an empty onset ᄋ across a space.

    Single finals move to the onset position after the space
    ("ᆨ ᄋ" -> " ᄀ"); for double finals the first part stays before the
    space and the second part becomes the onset ("ᆪ ᄋ" -> "ᆨ ᄊ").

    inp: input string of jamo.
    descriptive: not used.
    verbose: boolean, forwarded to `gloss`.

    Returns the rewritten string.
    '''
    rule = rule_id2text["15"]
    out = inp

    # Every pattern contains the space between the final and ᄋ. Without it
    # the single-final patterns would insert a space inside a word instead
    # of linking across an existing one.
    pairs = [ ("ᆨ ᄋ", " ᄀ"),
              ("ᆩ ᄋ", " ᄁ"),
              ("ᆫ ᄋ", " ᄂ"),
              ("ᆮ ᄋ", " ᄃ"),
              ("ᆯ ᄋ", " ᄅ"),
              ("ᆷ ᄋ", " ᄆ"),
              ("ᆸ ᄋ", " ᄇ"),
              ("ᆺ ᄋ", " ᄉ"),
              ("ᆻ ᄋ", " ᄊ"),
              ("ᆽ ᄋ", " ᄌ"),
              ("ᆾ ᄋ", " ᄎ"),
              ("ᆿ ᄋ", " ᄏ"),
              ("ᇀ ᄋ", " ᄐ"),
              ("ᇁ ᄋ", " ᄑ"),

              ("ᆪ ᄋ", "ᆨ ᄊ"),
              ("ᆬ ᄋ", "ᆫ ᄌ"),
              ("ᆰ ᄋ", "ᆯ ᄀ"),
              ("ᆱ ᄋ", "ᆯ ᄆ"),
              ("ᆲ ᄋ", "ᆯ ᄇ"),
              ("ᆳ ᄋ", "ᆯ ᄊ"),
              ("ᆴ ᄋ", "ᆯ ᄐ"),
              ("ᆵ ᄋ", "ᆯ ᄑ"),
              ("ᆹ ᄋ", "ᆸ ᄊ") ]

    for str1, str2 in pairs:
        out = out.replace(str1, str2)

    gloss(verbose, out, inp, rule)
    return out
Ejemplo n.º 3
0
def jyeo(inp, descriptive=False, verbose=False):
    '''Rewrite the vowel ᅧ as ᅥ after the onsets ᄌ, ᄍ and ᄎ.

    inp: input string of jamo.
    descriptive: not used.
    verbose: boolean, forwarded to `gloss`.

    Returns the rewritten string.
    '''
    rule = rule_id2text["5.1"]
    # Treated as a general rule (by kyubyong).
    onset_then_yeo = "([ᄌᄍᄎ])ᅧ"
    converted = re.sub(onset_then_yeo, r"\1ᅥ", inp)

    gloss(verbose, converted, inp, rule)
    return converted
Ejemplo n.º 4
0
def rieulgiyeok(inp, descriptive=False, verbose=False):
    '''Rewrite "ᆰ/P" followed by ᄀ or ᄁ as ᆯ plus ᄁ.

    The "/P" marker is consumed by the substitution.

    inp: input string of jamo, possibly carrying "/P" markers.
    descriptive: not used.
    verbose: boolean, forwarded to `gloss`.

    Returns the rewritten string.
    '''
    rule = rule_id2text["11.1"]

    # The following onset is matched (and captured) but always replaced
    # by ᄁ, whichever of the two it was.
    converted = re.sub("ᆰ/P([ᄀᄁ])", r"ᆯᄁ", inp)

    gloss(verbose, converted, inp, rule)
    return converted
Ejemplo n.º 5
0
def vowel_ui(inp, descriptive=False, verbose=False):
    '''Rewrite ᄋ+ᅴ as ᄋ+ᅵ when the ᄋ is preceded by a non-space character.

    The rewrite only happens when `descriptive` is true; otherwise the
    input is returned unchanged.

    inp: input string of jamo.
    descriptive: boolean. If True, the rewrite is applied.
    verbose: boolean, forwarded to `gloss`.

    Returns the (possibly rewritten) string.
    '''
    rule = rule_id2text["5.4.1"]
    # In practice, speakers very often pronounce '의' outside the first
    # syllable of a word as [ㅣ].
    if descriptive:
        # Raw string: in a plain literal "\S" is an invalid escape sequence
        # and triggers a warning on current Python versions.
        out = re.sub(r"(\Sᄋ)ᅴ", r"\1ᅵ", inp)
    else:
        out = inp
    gloss(verbose, out, inp, rule)
    return out
Ejemplo n.º 6
0
def josa_ui(inp, descriptive=False, verbose=False):
    '''Handle the particle marked as "의/J".

    With `descriptive` true, "의/J" is replaced by "에" (marker removed).
    Otherwise only the "/J" markers are stripped.

    inp: input string, possibly carrying "/J" markers.
    descriptive: boolean. If True, the particle is rewritten.
    verbose: boolean, forwarded to `gloss`.

    Returns the rewritten string.
    '''
    # Local import keeps this block self-contained.
    import unicodedata

    rule = rule_id2text["5.4.2"]
    # In practice, speakers very often pronounce the particle '의' as [ㅔ].
    if descriptive:
        out = inp
        # The pipeline in __call__ runs this rule after decomposition, where
        # the particle is spelled with conjoining jamo, so a pattern made of
        # precomposed syllables alone never matches. Try the decomposed
        # spelling first and keep the composed one for direct callers.
        # NOTE(review): assumes decomposition yields the canonical (NFD)
        # jamo sequence - confirm against h2j.
        for form in ("NFD", "NFC"):
            out = out.replace(unicodedata.normalize(form, "의/J"),
                              unicodedata.normalize(form, "에"))
    else:
        out = inp.replace("/J", "")
    gloss(verbose, out, inp, rule)
    return out
Ejemplo n.º 7
0
def balb(inp, descriptive=False, verbose=False):
    '''Reduce the final ᆲ to ᆸ in two exceptional contexts.

    * after 바, when ᆲ is at the end of the string or followed by any
      character other than ᄋ or ᄒ;
    * after 너, when ᆲ is followed by ᄌ/ᄍ/ᄃ/ᄄ plus the vowel ᅮ.

    inp: input string of jamo.
    descriptive: not used.
    verbose: boolean, forwarded to `gloss`.

    Returns the rewritten string.
    '''
    # Local import keeps this block self-contained.
    import unicodedata

    rule = rule_id2text["10.1"]
    out = inp
    syllable_final_or_consonants = "($|[^ᄋᄒ])"

    # The anchors are decomposed with NFD so that they match text spelled
    # with conjoining jamo; a precomposed syllable followed by a conjoining
    # final never occurs in such text. Only Hangul syllables are affected by
    # the normalization; the rest of each pattern is left as written.
    # NOTE(review): assumes decomposition yields the canonical (NFD) jamo
    # sequence - confirm against h2j.
    balb_pattern = unicodedata.normalize(
        "NFD", f"(바)ᆲ({syllable_final_or_consonants})")
    neolb_pattern = unicodedata.normalize("NFD", "(너)ᆲ([ᄌᄍ]ᅮ|[ᄃᄄ]ᅮ)")

    # exceptions
    out = re.sub(balb_pattern, r"\1ᆸ\2", out)
    out = re.sub(neolb_pattern, r"\1ᆸ\2", out)
    gloss(verbose, out, inp, rule)
    return out
Ejemplo n.º 8
0
def ye(inp, descriptive=False, verbose=False):
    '''Rewrite the vowel ᅨ as ᅦ after most onsets when `descriptive` is true.

    The onsets ᄂ, ᄉ, ᄊ and ᄋ are left out of the pattern, so ᅨ after
    them is kept. With `descriptive` false the input is returned unchanged.

    inp: input string of jamo.
    descriptive: boolean. If True, the rewrite is applied.
    verbose: boolean, forwarded to `gloss`.

    Returns the (possibly rewritten) string.
    '''
    rule = rule_id2text["5.2"]
    # In practice, speakers pronounce 'ㅖ' as [ㅔ] except in 예, 녜, 셰, 쎼.
    # (by kyubyong)

    if descriptive:
        # The class must list the initial-consonant jamo ᄅ (U+1105). The
        # look-alike compatibility letter U+3139 is a different code point
        # and does not match the onsets listed next to it.
        out = re.sub("([ᄀᄁᄃᄄᄅᄆᄇᄈᄌᄍᄎᄏᄐᄑᄒ])ᅨ", r"\1ᅦ", inp)
    else:
        out = inp
    gloss(verbose, out, inp, rule)
    return out
Ejemplo n.º 9
0
def rieulbieub(inp, descriptive=False, verbose=False):
    '''Tense the onset that follows "ᆲ/P" or "ᆴ/P".

    ᄀ, ᄃ, ᄉ and ᄌ become ᄁ, ᄄ, ᄊ and ᄍ; the final consonant is kept
    and the "/P" marker is consumed.

    inp: input string of jamo, possibly carrying "/P" markers.
    descriptive: not used.
    verbose: boolean, forwarded to `gloss`.

    Returns the rewritten string.
    '''
    rule = rule_id2text["25"]

    rewrites = (
        ("([ᆲᆴ])/Pᄀ", r"\1ᄁ"),
        ("([ᆲᆴ])/Pᄃ", r"\1ᄄ"),
        ("([ᆲᆴ])/Pᄉ", r"\1ᄊ"),
        ("([ᆲᆴ])/Pᄌ", r"\1ᄍ"),
    )
    converted = inp
    for pattern, replacement in rewrites:
        converted = re.sub(pattern, replacement, converted)

    gloss(verbose, converted, inp, rule)
    return converted
Ejemplo n.º 10
0
def jamo(inp, descriptive=False, verbose=False):
    '''Rewrite the final consonant of 귿 / 읒 / 읓 / 읕 / 읗 / 읔 / 읖 before ᄋ.

    The final consonant and the following ᄋ are replaced by a single
    onset: ᄉ for ᆮ (after 그) and for ᆽ, ᆾ, ᇀ, ᇂ (after 으), ᄀ for ᆿ
    and ᄇ for ᇁ (after 으).

    inp: input string of jamo.
    descriptive: not used.
    verbose: boolean, forwarded to `gloss`.

    Returns the rewritten string.
    '''
    # Local import keeps this block self-contained.
    import unicodedata

    rule = rule_id2text["16"]
    out = inp

    # The leading syllable is written as a group and decomposed with NFD so
    # it matches text spelled with conjoining jamo. A precomposed syllable
    # never precedes a conjoining final in such text, and a decomposed
    # syllable inside a [...] class would match one jamo, not the pair.
    # NOTE(review): assumes decomposition yields the canonical (NFD) jamo
    # sequence - confirm against h2j.
    pairs = (
        ("(그)ᆮᄋ", r"\1ᄉ"),
        ("(으)[ᆽᆾᇀᇂ]ᄋ", r"\1ᄉ"),
        ("(으)[ᆿ]ᄋ", r"\1ᄀ"),
        ("(으)[ᇁ]ᄋ", r"\1ᄇ"),
    )
    for str1, str2 in pairs:
        out = re.sub(unicodedata.normalize("NFD", str1), str2, out)

    gloss(verbose, out, inp, rule)
    return out
Ejemplo n.º 11
0
def link1(inp, descriptive=False, verbose=False):
    '''Move a single final consonant onto a directly following empty onset ᄋ.

    Each "final + ᄋ" sequence in the table is replaced by the matching
    onset consonant.

    inp: input string of jamo.
    descriptive: not used.
    verbose: boolean, forwarded to `gloss`.

    Returns the rewritten string.
    '''
    rule = rule_id2text["13"]

    # final consonant + ᄋ  ->  onset consonant
    final_to_onset = {
        "ᆨᄋ": "ᄀ",
        "ᆩᄋ": "ᄁ",
        "ᆫᄋ": "ᄂ",
        "ᆮᄋ": "ᄃ",
        "ᆯᄋ": "ᄅ",
        "ᆷᄋ": "ᄆ",
        "ᆸᄋ": "ᄇ",
        "ᆺᄋ": "ᄉ",
        "ᆻᄋ": "ᄊ",
        "ᆽᄋ": "ᄌ",
        "ᆾᄋ": "ᄎ",
        "ᆿᄋ": "ᄏ",
        "ᇀᄋ": "ᄐ",
        "ᇁᄋ": "ᄑ",
    }

    linked = inp
    for sequence, onset in final_to_onset.items():
        linked = linked.replace(sequence, onset)

    gloss(verbose, linked, inp, rule)
    return linked
Ejemplo n.º 12
0
def link4(inp, descriptive=False, verbose=False):
    '''Rewrite finals containing ᇂ before an empty onset ᄋ.

    "ᇂᄋ" becomes ᄋ, "ᆭᄋ" becomes ᄂ and "ᆶᄋ" becomes ᄅ.

    inp: input string of jamo.
    descriptive: not used.
    verbose: boolean, forwarded to `gloss`.

    Returns the rewritten string.
    '''
    rule = rule_id2text["12.4"]

    replacements = (
        ("ᇂᄋ", "ᄋ"),
        ("ᆭᄋ", "ᄂ"),
        ("ᆶᄋ", "ᄅ"),
    )

    linked = inp
    for sequence, onset in replacements:
        linked = linked.replace(sequence, onset)

    gloss(verbose, linked, inp, rule)
    return linked
Ejemplo n.º 13
0
def palatalize(inp, descriptive=False, verbose=False):
    '''Rewrite ᆮ / ᇀ / ᆴ finals before ᄋ or ᄒ plus a front vowel.

    Before ᄋ followed by ᅵ or ᅧ: ᆮ becomes the onset ᄌ, ᇀ becomes the
    onset ᄎ, and ᆴ becomes ᆯ plus the onset ᄎ. Before ᄒ followed by ᅵ,
    ᆮ and ᄒ together become the onset ᄎ.

    inp: input string of jamo.
    descriptive: not used.
    verbose: boolean, forwarded to `gloss`.

    Returns the rewritten string.
    '''
    rule = rule_id2text["17"]

    # Applied in this order.
    rewrites = (
        ("ᆮᄋ([ᅵᅧ])", r"ᄌ\1"),
        ("ᇀᄋ([ᅵᅧ])", r"ᄎ\1"),
        ("ᆴᄋ([ᅵᅧ])", r"ᆯᄎ\1"),
        ("ᆮᄒ([ᅵ])", r"ᄎ\1"),
    )

    converted = inp
    for pattern, replacement in rewrites:
        converted = re.sub(pattern, replacement, converted)

    gloss(verbose, converted, inp, rule)
    return converted
Ejemplo n.º 14
0
def link2(inp, descriptive=False, verbose=False):
    '''Split a double final consonant before a directly following ᄋ.

    The first part stays as the final consonant and the second part
    replaces ᄋ as the onset (for example "ᆪᄋ" becomes "ᆨᄊ").

    inp: input string of jamo.
    descriptive: not used.
    verbose: boolean, forwarded to `gloss`.

    Returns the rewritten string.
    '''
    rule = rule_id2text["14"]

    # double final + ᄋ  ->  remaining final + onset
    split_table = {
        "ᆪᄋ": "ᆨᄊ",
        "ᆬᄋ": "ᆫᄌ",
        "ᆰᄋ": "ᆯᄀ",
        "ᆱᄋ": "ᆯᄆ",
        "ᆲᄋ": "ᆯᄇ",
        "ᆳᄋ": "ᆯᄊ",
        "ᆴᄋ": "ᆯᄐ",
        "ᆵᄋ": "ᆯᄑ",
        "ᆹᄋ": "ᆸᄊ",
    }

    linked = inp
    for sequence, replacement in split_table.items():
        linked = linked.replace(sequence, replacement)

    gloss(verbose, linked, inp, rule)
    return linked
Ejemplo n.º 15
0
def verb_nieun(inp, descriptive=False, verbose=False):
    '''Tense the onset that follows a "/P"-marked ᆫ, ᆷ, ᆬ or ᆱ final.

    ᄀ, ᄃ, ᄉ and ᄌ become ᄁ, ᄄ, ᄊ and ᄍ. The finals ᆫ and ᆷ are kept,
    ᆬ is reduced to ᆫ and ᆱ to ᆷ; the "/P" marker is consumed.

    inp: input string of jamo, possibly carrying "/P" markers.
    descriptive: not used.
    verbose: boolean, forwarded to `gloss`.

    Returns the rewritten string.
    '''
    rule = rule_id2text["24"]

    rewrites = (
        # simple finals: keep the captured final
        ("([ᆫᆷ])/Pᄀ", r"\1ᄁ"),
        ("([ᆫᆷ])/Pᄃ", r"\1ᄄ"),
        ("([ᆫᆷ])/Pᄉ", r"\1ᄊ"),
        ("([ᆫᆷ])/Pᄌ", r"\1ᄍ"),
        # ᆬ is reduced to ᆫ
        ("ᆬ/Pᄀ", "ᆫᄁ"),
        ("ᆬ/Pᄃ", "ᆫᄄ"),
        ("ᆬ/Pᄉ", "ᆫᄊ"),
        ("ᆬ/Pᄌ", "ᆫᄍ"),
        # ᆱ is reduced to ᆷ
        ("ᆱ/Pᄀ", "ᆷᄁ"),
        ("ᆱ/Pᄃ", "ᆷᄄ"),
        ("ᆱ/Pᄉ", "ᆷᄊ"),
        ("ᆱ/Pᄌ", "ᆷᄍ"),
    )

    converted = inp
    for pattern, replacement in rewrites:
        converted = re.sub(pattern, replacement, converted)

    gloss(verbose, converted, inp, rule)
    return converted
Ejemplo n.º 16
0
def modifying_rieul(inp, descriptive=False, verbose=False):
    '''Tense the consonant that follows certain ᆯ finals.

    Two groups of rewrites are applied:
    * after "ᆯ/E" plus a space, the onsets ᄀ, ᄃ, ᄇ, ᄉ, ᄌ become
      ᄁ, ᄄ, ᄈ, ᄊ, ᄍ (the "/E" marker is consumed);
    * a fixed list of endings directly after ᆯ (걸, 밖에, 세라, 수록,
      지라도, 지언정, 진대) is rewritten to its tensed spelling.

    inp: input string of jamo, possibly carrying "/E" markers.
    descriptive: not used.
    verbose: boolean, forwarded to `gloss`.

    Returns the rewritten string.
    '''
    # Local import keeps this block self-contained.
    import unicodedata

    rule = rule_id2text["27"]
    out = inp

    pairs = [("ᆯ/E ᄀ", r"ᆯ ᄁ"), ("ᆯ/E ᄃ", r"ᆯ ᄄ"), ("ᆯ/E ᄇ", r"ᆯ ᄈ"),
             ("ᆯ/E ᄉ", r"ᆯ ᄊ"), ("ᆯ/E ᄌ", r"ᆯ ᄍ"), ("ᆯ걸", "ᆯ껄"),
             ("ᆯ밖에", "ᆯ빠께"), ("ᆯ세라", "ᆯ쎄라"), ("ᆯ수록", "ᆯ쑤록"),
             ("ᆯ지라도", "ᆯ찌라도"), ("ᆯ지언정", "ᆯ찌언정"),
             ("ᆯ진대", "ᆯ찐대")]

    for str1, str2 in pairs:
        # NFD turns any precomposed syllable in a pattern or replacement
        # into conjoining jamo, so the ending rules can match text spelled
        # with conjoining jamo; strings without precomposed syllables are
        # unchanged by it.
        # NOTE(review): assumes decomposition yields the canonical (NFD)
        # jamo sequence - confirm against h2j.
        out = re.sub(unicodedata.normalize("NFD", str1),
                     unicodedata.normalize("NFD", str2), out)

    gloss(verbose, out, inp, rule)
    return out
Ejemplo n.º 17
0
    def __call__(self,
                 string,
                 descriptive=False,
                 verbose=False,
                 group_vowels=False,
                 to_syl=True):
        '''Run the whole conversion pipeline on `string`.

        string: input string
        descriptive: boolean, forwarded to every rule function.
        verbose: boolean, forwarded to every rule function and to `gloss`.
        group_vowels: boolean. If True, the vowels of the identical sound
            are normalized (the result is passed through `group`).
        to_syl: boolean. If True, hangul letters or jamo are assembled to
            form syllables (the result is passed through `compose`).

        Returns the converted string.

        Example, for the input "나의 친구가 mp3 file 3개를 다운받고 있다":

        STEP 1. idioms
        -> 나의 친구가 엠피쓰리 file 3개를 다운받고 있다

        STEP 2. English to Hangul
        -> 나의 친구가 엠피쓰리 파일 3개를 다운받고 있다

        STEP 3. annotate
        -> 나의/J 친구가 엠피쓰리 파일 3개/B를 다운받고 있다

        STEP 4. Spell out arabic numbers
        -> 나의/J 친구가 엠피쓰리 파일 세개/B를 다운받고 있다

        STEP 5. decompose
        -> 나의/J 친구가 엠피쓰리 파일 세개/B를 다운받고 있다

        STEP 6-9. Hangul
        -> 나의 친구가 엠피쓰리 파일 세개를 다운받꼬 읻따
        '''
        # Steps 1-4 work on the text before it is decomposed:
        # idioms, English to Hangul, annotation, number spelling.
        text = self.idioms(string, descriptive, verbose)
        text = convert_eng(text, self.cmu)
        text = annotate(text, self.mecab)
        text = convert_num(text)

        # Step 5: decompose.
        inp = h2j(text)

        # Step 6: special rules, applied in this order; the annotation
        # markers are dropped once all of them have run.
        special_rules = (
            jyeo, ye, consonant_ui, josa_ui, vowel_ui,
            jamo, rieulgiyeok, rieulbieub, verb_nieun,
            balb, palatalize, modifying_rieul,
        )
        for apply_rule in special_rules:
            inp = apply_rule(inp, descriptive, verbose)
        inp = re.sub("/[PJEB]", "", inp)

        # Step 7: regular table (batchim + onset). Each row is reported
        # to `gloss` with the text of the rules it refers to.
        for str1, str2, rule_ids in self.table:
            before = inp
            inp = re.sub(str1, str2, before)

            rule = ("\n".join(self.rule2text.get(rule_id, "")
                              for rule_id in rule_ids)
                    if len(rule_ids) > 0 else "")
            gloss(verbose, inp, before, rule)

        # Step 8: linking rules.
        for apply_link in (link1, link2, link3, link4):
            inp = apply_link(inp, descriptive, verbose)

        # Step 9: optional post-processing.
        if group_vowels:
            inp = group(inp)

        if to_syl:
            inp = compose(inp)
        return inp
Ejemplo n.º 18
0
def consonant_ui(inp, descriptive=False, verbose=False):
    '''Rewrite the vowel ᅴ as ᅵ after any onset consonant other than ᄋ.

    inp: input string of jamo.
    descriptive: not used.
    verbose: boolean, forwarded to `gloss`.

    Returns the rewritten string.
    '''
    rule = rule_id2text["5.3"]

    onset_then_ui = "([ᄀᄁᄂᄃᄄᄅᄆᄇᄈᄉᄊᄌᄍᄎᄏᄐᄑᄒ])ᅴ"
    converted = re.sub(onset_then_ui, r"\1ᅵ", inp)

    gloss(verbose, converted, inp, rule)
    return converted