def idioms(self, string, descriptive=False, verbose=False):
    '''Apply the replacement rules listed in `idioms.txt` to `string`.

    `idioms.txt` is read from the directory that contains this module.
    For every line, anything after a "#" is discarded and the rest is
    stripped; the line is used only if it contains "===". The text left
    of "===" is applied as a regular-expression pattern and the text right
    of it as the replacement.

    string: input string.
    descriptive: not used.
    verbose: boolean, forwarded to `gloss`.

    Returns the string after all replacements have been applied.

    >>> idioms("지금 mp3 파일을 다운받고 있어요")
    지금 엠피쓰리 파일을 다운받고 있어요
    '''
    rule = "from idioms.txt"
    out = string

    # os.path.join puts the path separator between the module directory and
    # the file name; plain concatenation would name a sibling of the
    # directory instead of a file inside it.
    idioms_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "idioms.txt")

    # The context manager closes the file even when a pattern fails to compile.
    with open(idioms_path, 'r', encoding="utf8") as fin:
        for line in fin:
            line = line.split("#")[0].strip()
            if "===" in line:
                str1, str2 = line.split("===")
                out = re.sub(str1, str2, out)
    gloss(verbose, out, string, rule)

    return out
def link3(inp, descriptive=False, verbose=False):
    '''Carry a final consonant across a space onto a following ᄋ onset.

    For each simple final consonant followed by a space and ᄋ, the final is
    removed and the matching onset consonant is placed after the space. For
    each listed cluster final, the cluster is split around the space as
    given in the table below.

    inp: input string.
    descriptive: not used.
    verbose: boolean, forwarded to `gloss`.

    Returns the rewritten string.
    '''
    rule = rule_id2text["15"]

    out = inp

    # Every pattern contains the space between the final consonant and ᄋ.
    # The replacements of the simple finals start with a space, so a pattern
    # without one would insert a space inside a word rather than link two
    # words.
    pairs = [
        ("ᆨ ᄋ", " ᄀ"),
        ("ᆩ ᄋ", " ᄁ"),
        ("ᆫ ᄋ", " ᄂ"),
        ("ᆮ ᄋ", " ᄃ"),
        ("ᆯ ᄋ", " ᄅ"),
        ("ᆷ ᄋ", " ᄆ"),
        ("ᆸ ᄋ", " ᄇ"),
        ("ᆺ ᄋ", " ᄉ"),
        ("ᆻ ᄋ", " ᄊ"),
        ("ᆽ ᄋ", " ᄌ"),
        ("ᆾ ᄋ", " ᄎ"),
        ("ᆿ ᄋ", " ᄏ"),
        ("ᇀ ᄋ", " ᄐ"),
        ("ᇁ ᄋ", " ᄑ"),

        # cluster finals
        ("ᆪ ᄋ", "ᆨ ᄊ"),
        ("ᆬ ᄋ", "ᆫ ᄌ"),
        ("ᆰ ᄋ", "ᆯ ᄀ"),
        ("ᆱ ᄋ", "ᆯ ᄆ"),
        ("ᆲ ᄋ", "ᆯ ᄇ"),
        ("ᆳ ᄋ", "ᆯ ᄊ"),
        ("ᆴ ᄋ", "ᆯ ᄐ"),
        ("ᆵ ᄋ", "ᆯ ᄑ"),
        ("ᆹ ᄋ", "ᆸ ᄊ"),
    ]
    for str1, str2 in pairs:
        out = out.replace(str1, str2)

    gloss(verbose, out, inp, rule)
    return out
def jyeo(inp, descriptive=False, verbose=False):
    '''Replace the vowel ᅧ with ᅥ after the onsets ᄌ, ᄍ and ᄎ.

    inp: input string.
    descriptive: not used.
    verbose: boolean, forwarded to `gloss`.

    Returns the rewritten string.
    '''
    rule = rule_id2text["5.1"]
    # Treated as a general rule (by kyubyong).

    affricate_yeo = re.compile("([ᄌᄍᄎ])ᅧ")
    converted = affricate_yeo.sub(r"\1ᅥ", inp)

    gloss(verbose, converted, inp, rule)
    return converted
def rieulgiyeok(inp, descriptive=False, verbose=False):
    '''Rewrite "ᆰ/P" followed by ᄀ or ᄁ as "ᆯᄁ".

    The matched onset is consumed and always replaced by ᄁ.

    inp: input string; "/P" is an annotation tag that must follow ᆰ.
    descriptive: not used.
    verbose: boolean, forwarded to `gloss`.

    Returns the rewritten string.
    '''
    rule = rule_id2text["11.1"]

    cluster_before_velar = re.compile("ᆰ/P([ᄀᄁ])")
    converted = cluster_before_velar.sub(r"ᆯᄁ", inp)

    gloss(verbose, converted, inp, rule)
    return converted
def vowel_ui(inp, descriptive=False, verbose=False):
    '''Rewrite the vowel ᅴ as ᅵ after a ᄋ onset that is not word-initial.

    The rewrite is applied only when `descriptive` is true; otherwise the
    input is returned unchanged. A ᄋ counts as non-initial when it is
    directly preceded by a non-whitespace character.

    inp: input string.
    descriptive: boolean. Enables the rewrite.
    verbose: boolean, forwarded to `gloss`.

    Returns the (possibly rewritten) string.
    '''
    rule = rule_id2text["5.4.1"]
    # In practice, speakers very likely pronounce '의' outside the first
    # syllable of a word as [ㅣ].
    if descriptive:
        # Raw string: "\S" is a regex character class, not a Python string
        # escape, and a non-raw literal triggers an invalid-escape warning.
        out = re.sub(r"(\Sᄋ)ᅴ", r"\1ᅵ", inp)
    else:
        out = inp
    gloss(verbose, out, inp, rule)
    return out
def josa_ui(inp, descriptive=False, verbose=False):
    '''Handle the "/J"-tagged 의.

    When `descriptive` is true, every "의/J" is replaced by "에" (other "/J"
    tags are left in place). Otherwise all "/J" tags are simply removed.

    inp: input string.
    descriptive: boolean. Selects which of the two rewrites is applied.
    verbose: boolean, forwarded to `gloss`.

    Returns the rewritten string.
    '''
    rule = rule_id2text["5.4.2"]
    # In practice, speakers very likely pronounce the particle '의' as [ㅔ].
    converted = (
        re.sub("의/J", "에", inp) if descriptive else inp.replace("/J", "")
    )
    gloss(verbose, converted, inp, rule)
    return converted
def balb(inp, descriptive=False, verbose=False):
    '''Replace the final ᆲ with ᆸ in two listed exception contexts.

    The two substitutions are applied in order:
    1. after "바", when ᆲ is followed by end-of-string or by any character
       other than ᄋ and ᄒ;
    2. after "너", when ᆲ is followed by ᄌ/ᄍ + ᅮ or ᄃ/ᄄ + ᅮ.

    inp: input string.
    descriptive: not used.
    verbose: boolean, forwarded to `gloss`.

    Returns the rewritten string.
    '''
    rule = rule_id2text["10.1"]
    syllable_final_or_consonants = "($|[^ᄋᄒ])"

    # exceptions
    substitutions = (
        ("(바)ᆲ(" + syllable_final_or_consonants + ")", r"\1ᆸ\2"),
        ("(너)ᆲ([ᄌᄍ]ᅮ|[ᄃᄄ]ᅮ)", r"\1ᆸ\2"),
    )

    converted = inp
    for pattern, replacement in substitutions:
        converted = re.sub(pattern, replacement, converted)

    gloss(verbose, converted, inp, rule)
    return converted
def ye(inp, descriptive=False, verbose=False):
    '''Rewrite the vowel ᅨ as ᅦ after the listed onsets (descriptive mode).

    The rewrite is applied only when `descriptive` is true; otherwise the
    input is returned unchanged. The onsets ᄋ, ᄂ, ᄉ and ᄊ are not in the
    list, so ᅨ after them is left alone.

    inp: input string.
    descriptive: boolean. Enables the rewrite.
    verbose: boolean, forwarded to `gloss`.

    Returns the (possibly rewritten) string.
    '''
    rule = rule_id2text["5.2"]
    # In practice, speakers pronounce 'ㅖ' other than in 예, 녜, 셰, 쎼 as [ㅔ].
    # (by kyubyong)
    if descriptive:
        # The rieul in this class must be the conjoining onset ᄅ (U+1105),
        # like every other member; the compatibility letter U+3139 is a
        # different code point and never precedes the conjoining vowel ᅨ.
        out = re.sub("([ᄀᄁᄃᄄᄅᄆᄇᄈᄌᄍᄎᄏᄐᄑᄒ])ᅨ", r"\1ᅦ", inp)
    else:
        out = inp
    gloss(verbose, out, inp, rule)
    return out
def rieulbieub(inp, descriptive=False, verbose=False):
    '''Tense the onset that follows a "/P"-tagged ᆲ or ᆴ final.

    For each of the onsets ᄀ, ᄃ, ᄉ, ᄌ (in that order), the sequence
    final + "/P" + onset is rewritten as final + tensed onset, which also
    removes the "/P" tag at that position.

    inp: input string.
    descriptive: not used.
    verbose: boolean, forwarded to `gloss`.

    Returns the rewritten string.
    '''
    rule = rule_id2text["25"]

    # (plain onset, tensed onset), applied in this order.
    tensed_onsets = (("ᄀ", "ᄁ"), ("ᄃ", "ᄄ"), ("ᄉ", "ᄊ"), ("ᄌ", "ᄍ"))

    converted = inp
    for plain, tense in tensed_onsets:
        converted = re.sub("([ᆲᆴ])/P" + plain, r"\1" + tense, converted)

    gloss(verbose, converted, inp, rule)
    return converted
def jamo(inp, descriptive=False, verbose=False):
    '''Apply the four final-consonant + ᄋ rewrites listed in the table.

    Each pattern captures a preceding character, drops the listed final
    consonant together with the following ᄋ, and inserts a fixed onset
    (ᄉ, ᄀ or ᄇ) in their place. The substitutions run in table order.

    inp: input string.
    descriptive: not used.
    verbose: boolean, forwarded to `gloss`.

    Returns the rewritten string.
    '''
    rule = rule_id2text["16"]

    substitutions = (
        ("([그])ᆮᄋ", r"\1ᄉ"),
        ("([으])[ᆽᆾᇀᇂ]ᄋ", r"\1ᄉ"),
        ("([으])[ᆿ]ᄋ", r"\1ᄀ"),
        ("([으])[ᇁ]ᄋ", r"\1ᄇ"),
    )

    converted = inp
    for pattern, replacement in substitutions:
        converted = re.sub(pattern, replacement, converted)

    gloss(verbose, converted, inp, rule)
    return converted
def link1(inp, descriptive=False, verbose=False):
    '''Move a simple final consonant onto a directly following ᄋ onset.

    Each listed final consonant immediately followed by ᄋ is replaced by
    the matching onset consonant.

    inp: input string.
    descriptive: not used.
    verbose: boolean, forwarded to `gloss`.

    Returns the rewritten string.
    '''
    rule = rule_id2text["13"]

    # Position i of `finals` maps to position i of `onsets`.
    finals = "ᆨᆩᆫᆮᆯᆷᆸᆺᆻᆽᆾᆿᇀᇁ"
    onsets = "ᄀᄁᄂᄃᄅᄆᄇᄉᄊᄌᄎᄏᄐᄑ"

    converted = inp
    for final, onset in zip(finals, onsets):
        converted = converted.replace(final + "ᄋ", onset)

    gloss(verbose, converted, inp, rule)
    return converted
def link4(inp, descriptive=False, verbose=False):
    '''Rewrite finals containing ᇂ before a ᄋ onset.

    In order: "ᇂᄋ" becomes "ᄋ", "ᆭᄋ" becomes "ᄂ", and "ᆶᄋ" becomes "ᄅ".

    inp: input string.
    descriptive: not used.
    verbose: boolean, forwarded to `gloss`.

    Returns the rewritten string.
    '''
    rule = rule_id2text["12.4"]

    converted = (
        inp.replace("ᇂᄋ", "ᄋ")
        .replace("ᆭᄋ", "ᄂ")
        .replace("ᆶᄋ", "ᄅ")
    )

    gloss(verbose, converted, inp, rule)
    return converted
def palatalize(inp, descriptive=False, verbose=False):
    '''Rewrite ᆮ/ᇀ/ᆴ finals before ᄋ + ᅵ/ᅧ, and ᆮ before ᄒ + ᅵ.

    The substitutions run in table order; each drops the final consonant
    and the following onset, inserts the listed onset (ᄌ or ᄎ, with ᆯ kept
    as a final for ᆴ), and keeps the captured vowel.

    inp: input string.
    descriptive: not used.
    verbose: boolean, forwarded to `gloss`.

    Returns the rewritten string.
    '''
    rule = rule_id2text["17"]

    substitutions = (
        ("ᆮᄋ([ᅵᅧ])", r"ᄌ\1"),
        ("ᇀᄋ([ᅵᅧ])", r"ᄎ\1"),
        ("ᆴᄋ([ᅵᅧ])", r"ᆯᄎ\1"),
        ("ᆮᄒ([ᅵ])", r"ᄎ\1"),
    )

    converted = inp
    for pattern, replacement in substitutions:
        converted = re.sub(pattern, replacement, converted)

    gloss(verbose, converted, inp, rule)
    return converted
def link2(inp, descriptive=False, verbose=False):
    '''Split a cluster final before a directly following ᄋ onset.

    Each listed cluster final immediately followed by ᄋ is replaced by a
    single final consonant plus an onset consonant, as given in the table.

    inp: input string.
    descriptive: not used.
    verbose: boolean, forwarded to `gloss`.

    Returns the rewritten string.
    '''
    rule = rule_id2text["14"]

    # cluster final + ᄋ  ->  remaining final + new onset
    cluster_links = {
        "ᆪᄋ": "ᆨᄊ",
        "ᆬᄋ": "ᆫᄌ",
        "ᆰᄋ": "ᆯᄀ",
        "ᆱᄋ": "ᆯᄆ",
        "ᆲᄋ": "ᆯᄇ",
        "ᆳᄋ": "ᆯᄊ",
        "ᆴᄋ": "ᆯᄐ",
        "ᆵᄋ": "ᆯᄑ",
        "ᆹᄋ": "ᆸᄊ",
    }

    converted = inp
    for source, target in cluster_links.items():
        converted = converted.replace(source, target)

    gloss(verbose, converted, inp, rule)
    return converted
def verb_nieun(inp, descriptive=False, verbose=False):
    '''Tense the onset that follows a "/P"-tagged ᆫ, ᆷ, ᆬ or ᆱ final.

    For the finals ᆫ/ᆷ the final is kept; ᆬ is reduced to ᆫ and ᆱ to ᆷ.
    In every case the "/P" tag at that position is dropped and the onset
    ᄀ, ᄃ, ᄉ or ᄌ is replaced by its tensed counterpart.

    inp: input string.
    descriptive: not used.
    verbose: boolean, forwarded to `gloss`.

    Returns the rewritten string.
    '''
    rule = rule_id2text["24"]

    # (pattern for the final, what the final becomes in the replacement)
    finals = (("([ᆫᆷ])", r"\1"), ("ᆬ", "ᆫ"), ("ᆱ", "ᆷ"))
    # (plain onset, tensed onset)
    tensed_onsets = (("ᄀ", "ᄁ"), ("ᄃ", "ᄄ"), ("ᄉ", "ᄊ"), ("ᄌ", "ᄍ"))

    converted = inp
    # Finals in the outer loop, onsets in the inner loop: this reproduces
    # the order in which the twelve substitutions are applied.
    for final_pattern, final_repl in finals:
        for plain, tense in tensed_onsets:
            converted = re.sub(
                final_pattern + "/P" + plain, final_repl + tense, converted)

    gloss(verbose, converted, inp, rule)
    return converted
def modifying_rieul(inp, descriptive=False, verbose=False):
    '''Tense the consonant after a "/E"-tagged ᆯ and in listed ᆯ-endings.

    First, "ᆯ/E" + space + one of ᄀ, ᄃ, ᄇ, ᄉ, ᄌ is rewritten as ᆯ + space +
    the tensed onset (dropping the tag). Then a fixed list of endings that
    start with ᆯ is replaced by its listed tensed spelling.

    inp: input string.
    descriptive: not used.
    verbose: boolean, forwarded to `gloss`.

    Returns the rewritten string.
    '''
    rule = rule_id2text["27"]

    tagged_rieul = tuple(
        ("ᆯ/E " + plain, "ᆯ " + tense)
        for plain, tense in (
            ("ᄀ", "ᄁ"), ("ᄃ", "ᄄ"), ("ᄇ", "ᄈ"), ("ᄉ", "ᄊ"), ("ᄌ", "ᄍ"))
    )
    fixed_endings = (
        ("ᆯ걸", "ᆯ껄"),
        ("ᆯ밖에", "ᆯ빠께"),
        ("ᆯ세라", "ᆯ쎄라"),
        ("ᆯ수록", "ᆯ쑤록"),
        ("ᆯ지라도", "ᆯ찌라도"),
        ("ᆯ지언정", "ᆯ찌언정"),
        ("ᆯ진대", "ᆯ찐대"),
    )

    converted = inp
    for pattern, replacement in tagged_rieul + fixed_endings:
        converted = re.sub(pattern, replacement, converted)

    gloss(verbose, converted, inp, rule)
    return converted
def __call__(self, string, descriptive=False, verbose=False, group_vowels=False, to_syl=True):
    '''Main function: run `string` through the whole conversion pipeline.

    string: input string
    descriptive: boolean, passed on to `idioms` and to every rule function.
    verbose: boolean, passed on to the rule functions and to `gloss`.
    group_vowels: boolean. If True, the vowels of the identical sound are normalized.
    to_syl: boolean. If True, hangul letters or jamo are assembled to form syllables.

    Returns the converted string.

    For example, given an input string "나의 친구가 mp3 file 3개를 다운받고 있다",
    STEP 1. idioms
    -> 나의 친구가 엠피쓰리 file 3개를 다운받고 있다

    STEP 2. English to Hangul
    -> 나의 친구가 엠피쓰리 파일 3개를 다운받고 있다

    STEP 3. annotate
    -> 나의/J 친구가 엠피쓰리 파일 3개/B를 다운받고 있다

    STEP 4. Spell out arabic numbers
    -> 나의/J 친구가 엠피쓰리 파일 세개/B를 다운받고 있다

    STEP 5. decompose
    -> 나의/J 친구가 엠피쓰리 파일 세개/B를 다운받고 있다

    STEP 6-9. Hangul
    -> 나의 친구가 엠피쓰리 파일 세개를 다운받꼬 읻따
    '''
    # 1. idioms
    text = self.idioms(string, descriptive, verbose)

    # 2. English to Hangul
    text = convert_eng(text, self.cmu)

    # 3. annotate
    text = annotate(text, self.mecab)

    # 4. Spell out arabic numbers
    text = convert_num(text)

    # 5. decompose
    seq = h2j(text)

    # 6. special rules, applied in this fixed order
    special_rules = (
        jyeo, ye, consonant_ui, josa_ui, vowel_ui,
        jamo, rieulgiyeok, rieulbieub, verb_nieun,
        balb, palatalize, modifying_rieul,
    )
    for apply_rule in special_rules:
        seq = apply_rule(seq, descriptive, verbose)
    # Drop whatever annotation tags the special rules left behind.
    seq = re.sub("/[PJEB]", "", seq)

    # 7. regular table: batchim + onset
    for pattern, replacement, rule_ids in self.table:
        before = seq
        seq = re.sub(pattern, replacement, seq)

        rule = ""
        if len(rule_ids) > 0:
            rule = "\n".join(
                self.rule2text.get(rule_id, "") for rule_id in rule_ids)
        gloss(verbose, seq, before, rule)

    # 8. link
    for apply_link in (link1, link2, link3, link4):
        seq = apply_link(seq, descriptive, verbose)

    # 9. postprocessing
    if group_vowels:
        seq = group(seq)

    if to_syl:
        seq = compose(seq)
    return seq
def consonant_ui(inp, descriptive=False, verbose=False):
    '''Replace the vowel ᅴ with ᅵ after any of the listed consonant onsets.

    The onset ᄋ is not in the list, so ᅴ after ᄋ is left unchanged.

    inp: input string.
    descriptive: not used.
    verbose: boolean, forwarded to `gloss`.

    Returns the rewritten string.
    '''
    rule = rule_id2text["5.3"]

    consonant_then_ui = re.compile("([ᄀᄁᄂᄃᄄᄅᄆᄇᄈᄉᄊᄌᄍᄎᄏᄐᄑᄒ])ᅴ")
    converted = consonant_then_ui.sub(r"\1ᅵ", inp)

    gloss(verbose, converted, inp, rule)
    return converted