Exemple #1
0
def j2h(text):
    """Compose runs of jamo found in ``text`` into Hangul syllables.

    The input is scanned left to right. Wherever a character from LEADS is
    followed by one from VOWELS, the pair (plus a following character from
    TAILS, when there is one) is handed to ``jamo.j2h`` and replaced by its
    result. Every other character is copied through unchanged.

    Returns the rebuilt string.
    """
    pieces = []
    total = len(text)
    pos = 0
    while pos < total:
        left = total - pos
        opens_syllable = (left >= 2 and text[pos] in LEADS
                          and text[pos + 1] in VOWELS)
        if not opens_syllable:
            # Not the start of a composable run: keep the character as-is.
            pieces.append(text[pos])
            pos += 1
            continue
        # Prefer the three-jamo form when a tail directly follows the vowel.
        if left >= 3 and text[pos + 2] in TAILS:
            width = 3
        else:
            width = 2
        pieces.append(jamo.j2h(*text[pos:pos + width]))
        pos += width
    return "".join(pieces)
Exemple #2
0
def _get_text_from_candidates(candidates):
    """Turn a sequence of jamo candidates into text.

    * no candidates  -> empty string
    * one candidate  -> that candidate passed through ``_jamo_char_to_hcj``
    * two or more    -> ``j2h`` called with the candidates bound, in order,
      to the ``lead``, ``vowel`` and ``tail`` keyword arguments (anything
      past the third candidate is dropped by ``zip``).
    """
    count = len(candidates)
    if count == 0:
        return ""
    if count == 1:
        return _jamo_char_to_hcj(candidates[0])
    slots = ("lead", "vowel", "tail")
    keyword_args = {slot: char for slot, char in zip(slots, candidates)}
    return j2h(**keyword_args)
    def test_j2h(self):
        """Check jamo.j2h against hard-coded expected syllables.

        Covers one three-argument call (with a tail) and one two-argument
        call (without a tail); each must return the expected one-character
        Hangul string.
        """
        failure_hint = "j2h doesn't work. Hint: it's the same as jamo_to_hangul."
        cases = (
            (('ㅎ', 'ㅏ', 'ㄴ'), "한"),
            (('ㅎ', 'ㅏ'), "하"),
        )
        for jamo_args, expected in cases:
            assert jamo.j2h(*jamo_args) == expected, failure_hint
Exemple #4
0
    def test_j2h(self):
        """Hard-coded checks for jamo.j2h.

        The first check passes three jamo (including a tail), the second
        passes two; both compare the result with a fixed Hangul syllable.
        """
        hint = "j2h doesn't work. Hint: it's the same as jamo_to_hangul."

        with_tail = jamo.j2h('ㅎ', 'ㅏ', 'ㄴ')
        assert with_tail == "한", hint

        without_tail = jamo.j2h('ㅎ', 'ㅏ')
        assert without_tail == "하", hint
Exemple #5
0
 def encode(self, text):
     """Encode the Maori words of ``text`` as Hangul syllables.

     The text is preprocessed with ``self.preprocess`` and split with
     ``TreebankWordTokenizer``. Words rejected by ``is_maori`` are kept
     verbatim. Every other word is split by ``self.tokenize``; each
     syllable made only of characters in ``alphabet`` is mapped through
     ``self.encoder_dict`` to a (consonant, vowel) pair and composed with
     ``jamo.j2h``. Syllables containing other characters are kept verbatim.

     Args:
         text: The input string.

     Returns:
         The detokenized string produced by ``TreebankWordDetokenizer``.

     Raises:
         KeyError: If a character of a syllable is missing from
             ``self.encoder_dict``. The original exception is re-raised
             after being logged.
     """
     text = self.preprocess(text, vowel_type=self.vowel_type)

     # Still a function-scope import, but resolved once instead of once per
     # word inside the loop.
     from reo_toolkit import is_maori

     words = []
     for word in TreebankWordTokenizer().tokenize(text):
         if not is_maori(word):
             words.append(word)
             continue
         encoded_text = []
         for syllable in self.tokenize(word):
             if not all(ch in alphabet for ch in syllable):
                 encoded_text.append(syllable)
                 continue
             # A bare vowel is prefixed with 'x' so that the lookup below
             # still yields a (consonant, vowel) pair.
             padded = 'x' + syllable if syllable in vowels else syllable
             try:
                 consonant, vowel = ''.join(
                     [self.encoder_dict[ch] for ch in padded])
             except KeyError:
                 logging.error(
                     "KeyError: phoneme {} not in encoder_dict".format(
                         padded))
                 # Bare raise keeps the missing key and the traceback.
                 raise
             try:
                 encoded = jamo.j2h(consonant, vowel)
             except jamo.InvalidJamoError:
                 logging.error(
                     'InvalidJamoError - Consonant={} Vowel={} Syllable={}'.
                     format(consonant, vowel, padded))
                 # Without a fallback, ``encoded`` would be unbound here or
                 # still hold the previous syllable's result. Keep the
                 # untouched syllable instead.
                 encoded = syllable
             encoded_text.append(encoded)
         words.append(''.join(encoded_text))
     return TreebankWordDetokenizer().detokenize(words)
Exemple #6
0
def get_text_from_candidates(candidates):
    """Build text from a sequence of jamo candidates.

    Two or more candidates are passed to ``j2h`` as the ``chosung``,
    ``jungsung`` and ``jongsung`` keyword arguments, in that order
    (``zip`` ignores anything past the third). A single candidate is
    converted with ``_jamo_char_to_hcj``. An empty sequence gives ``""``.
    """
    size = len(candidates)
    if size > 1:
        slot_names = ("chosung", "jungsung", "jongsung")
        return j2h(**dict(zip(slot_names, candidates)))
    if size == 1:
        return _jamo_char_to_hcj(candidates[0])
    return ""
Exemple #7
0
def j2syl(string):
    """Replace conjoining-jamo runs in ``string`` with composed syllables.

    Two passes are made: first every lead + vowel + tail run
    (U+1100-U+1112, U+1161-U+1175, U+11A8-U+11C2), then every remaining
    lead + vowel run. Each run is replaced by the result of ``j2h`` called
    with its individual jamo. All other characters are left untouched.
    """
    lead = "[\u1100-\u1112]"
    vowel = "[\u1161-\u1175]"
    tail = "[\u11A8-\u11C2]"

    def merge(found):
        # Spread the matched jamo into j2h's positional arguments.
        return j2h(*found.group(0))

    # Three-jamo runs go first so that their tails are not left behind by
    # the two-jamo pass.
    for pattern in (lead + vowel + tail, lead + vowel):
        string = re.sub(pattern, merge, string)

    return string
Exemple #8
0
def compose(letters):
    """Assemble conjoining jamo in ``letters`` into Hangul syllables.

    Steps:
      1. Every vowel (U+1161-U+1175) that is not directly preceded by a
         lead (U+1100-U+1112) gets the placeholder ᄋ inserted in front
         of it.
      2. Every lead + vowel + tail (U+11A8-U+11C2) run is replaced by
         ``j2h`` of its jamo.
      3. Every remaining lead + vowel run is replaced the same way.

    Returns the assembled string; characters that are not part of such a
    run are left unchanged.
    """
    # insert placeholder
    # A lookbehind is used instead of a capturing "(^|[^lead])" prefix: the
    # prefix form consumes the preceding character, so in a run of bare
    # vowels every second vowel was skipped and never got its placeholder.
    letters = re.sub("(?<![\u1100-\u1112])([\u1161-\u1175])", r"ᄋ\1",
                     letters)

    def merge(found):
        # Spread the matched jamo into j2h's positional arguments.
        return j2h(*found.group(0))

    # c+v+c first, so that tails are not left behind by the c+v pass
    string = re.sub("[\u1100-\u1112][\u1161-\u1175][\u11A8-\u11C2]", merge,
                    letters)

    # c+v
    string = re.sub("[\u1100-\u1112][\u1161-\u1175]", merge, string)

    return string
Exemple #9
0
    def compose(self, string):
        """Contract ``string`` and compose its jamo runs into syllables.

        The input first goes through ``self.contract``. Then lead + vowel +
        tail runs, followed by the remaining lead + vowel runs, are each
        replaced by ``j2h`` of their jamo. The resulting string is returned.
        """
        result = self.contract(string)

        syllable_patterns = (
            # CVC first
            re.compile("[\u1100-\u1112][\u1161-\u1175][\u11A8-\u11C2]"),
            # CV
            re.compile("[\u1100-\u1112][\u1161-\u1175]"),
        )
        for pattern in syllable_patterns:
            result = pattern.sub(lambda found: j2h(*found.group(0)), result)

        return result