Beispiel #1
0
    def test_is_hcj(self):
        """Check ``jamo.is_hcj`` against accepted and rejected characters.

        Accepted: every code point in U+3131..U+3163 and U+3165..U+318E.
        Rejected: U+3130, U+3164 and U+318F, the characters produced by
        ``_get_random_hangul(20)``, and a fixed string of other characters.
        """
        # U+3164 (the chaeum filler) is skipped on purpose: it is not
        # considered HCJ, but a special character as defined in
        # http://www.unicode.org/charts/PDF/U3130.pdf.
        hcj_code_points = itertools.chain(range(0x3131, 0x3164),
                                          range(0x3165, 0x318f))
        valid_hcj = map(chr, hcj_code_points)

        invalid_edge_cases = (chr(0x3130), chr(0x3164), chr(0x318f))
        invalid_hangul = _get_random_hangul(20)
        invalid_other = "abABzyZY ,.:;~`―—–/!@#$%^&*()[]{}ᄀᄓᅡᅶᆨᇃᇿ"

        not_hcj_msg = "Incorrectly decided U+{} was not hcj."
        was_hcj_msg = "Incorrectly decided U+{} was hcj."

        # Positive tests: each of these must be recognised as HCJ.
        for char in valid_hcj:
            assert jamo.is_hcj(char), \
                not_hcj_msg.format(format(ord(char), "x"))

        # Negative tests: none of these may be recognised as HCJ.
        rejected = itertools.chain(invalid_edge_cases, invalid_hangul,
                                   invalid_other)
        for char in rejected:
            assert not jamo.is_hcj(char), \
                was_hcj_msg.format(format(ord(char), "x"))
Beispiel #2
0
    def test_is_hcj(self):
        """Exercise ``jamo.is_hcj`` with single characters.

        Characters in U+3131..U+318E, except U+3164, are expected to be
        accepted. The code points just outside that range, U+3164 itself,
        the output of ``_get_random_hangul(20)`` and a handful of other
        characters are expected to be rejected.
        """

        def code_point(char):
            # Lower-case hex digits without the "0x" prefix, used in the
            # assertion messages below.
            return hex(ord(char))[2:]

        # The chaeum filler U+3164 is not considered HCJ, but a special
        # character as defined in http://www.unicode.org/charts/PDF/U3130.pdf,
        # so it is excluded from the accepted set.
        accepted = (chr(cp) for cp in range(0x3131, 0x318f) if cp != 0x3164)

        invalid_edge_cases = (chr(0x3130), chr(0x3164), chr(0x318f))
        invalid_hangul = _get_random_hangul(20)
        invalid_other = "abABzyZY ,.:;~`―—–/!@#$%^&*()[]{}ᄀᄓᅡᅶᆨᇃᇿ"

        # Positive tests
        for char in accepted:
            assert jamo.is_hcj(char), (
                "Incorrectly decided U+{} was not hcj.".format(
                    code_point(char)))

        # Negative tests, one group after another in the original order.
        for group in (invalid_edge_cases, invalid_hangul, invalid_other):
            for char in group:
                assert not jamo.is_hcj(char), (
                    "Incorrectly decided U+{} was hcj.".format(
                        code_point(char)))
Beispiel #3
0
    def combine(self, verb, ending, rule):
        """Join ``verb`` and ``ending`` as directed by ``rule``.

        ``rule`` is a comma-separated triple ``"stop,postfix,start"``.
        ``stop`` and ``start`` are integers or empty; an empty field leaves
        the corresponding slice bound open.

        The decomposed verb is cut to ``[:stop]`` and ``postfix`` is appended.
        The decomposed ending has its HCJ characters converted with
        ``hcj_to_jamo(char, "tail")`` and is cut to ``[start:]``. Both halves
        are joined with ``"|"`` and handed to ``self.compose``.

        Returns an empty list when ``rule`` is falsy, otherwise whatever
        ``self.compose`` returns.
        """
        if not rule:
            return []

        stop_field, postfix, start_field = rule.split(",")
        stop = int(stop_field) if stop_field != "" else None
        start = int(start_field) if start_field != "" else None

        # Verb side: decompose (h: hangul syllable, j: jamo), slice, then
        # append the postfix.
        stem = h2j(verb)[:stop] + postfix

        # Ending side: decompose, turn HCJ characters into tail jamo, slice.
        tail_chars = []
        for char in h2j(ending):
            tail_chars.append(
                hcj_to_jamo(char, "tail") if is_hcj(char) else char)
        suffix = "".join(tail_chars)[start:]

        # "|" marks the seam between the two halves for compose().
        return self.compose(stem + "|" + suffix)
Beispiel #4
0
def tokenize(text, as_id=False, symbol_type=1, debug=False):
    """Turn ``text`` into a list of symbols, or symbol ids, ending with EOS.

    The text is passed through ``normalize`` and ``hangul_to_jamo``; every
    resulting character for which ``is_hcj`` is true is converted with
    ``hcj_to_jamo(char, "lead")``. Each token is then looked up in the symbol
    map chosen by ``symbol_type`` (1 to 4) and the characters of the mapped
    value are appended to the output.

    Args:
        text: Input text.
        as_id: When true, return ids taken from the ``char_to_id_<n>`` table
            matching ``symbol_type`` instead of the symbols themselves.
        symbol_type: Selects the symbol map / id table pair (1, 2, 3 or 4).
        debug: When true, print the selected id table before tokenizing.

    Returns:
        A list of symbols (or ids) followed by EOS (or its id), or ``None``
        when ``symbol_type`` is not one of 1 to 4.
    """
    # All four symbol maps are loaded on every call, regardless of
    # symbol_type.
    j2hj = load_symbols_1()
    j2hcj = load_symbols_2()
    j2sj = load_symbols_3()
    j2shcj = load_symbols_4()

    text = normalize(text)
    pre_tokens = [
        hcj_to_jamo(char, "lead") if is_hcj(char) else char
        for char in list(hangul_to_jamo(text))
    ]

    # The id tables are wrapped in lambdas so that each module-level table is
    # only looked up when it is actually needed (debug output or as_id).
    variants = (
        (1, j2hj, lambda: char_to_id_1),
        (2, j2hcj, lambda: char_to_id_2),
        (3, j2sj, lambda: char_to_id_3),
        (4, j2shcj, lambda: char_to_id_4),
    )
    selected = next(
        (entry for entry in variants if symbol_type == entry[0]), None)
    if selected is None:
        # Unknown symbol_type: nothing is tokenized.
        return None
    _, symbol_map, get_id_table = selected

    if debug:
        print(get_id_table())

    tokens = []
    for token in pre_tokens:
        tokens.extend(symbol_map[token])

    if not as_id:
        return list(tokens) + [EOS]

    id_table = get_id_table()
    return [id_table[token] for token in tokens] + [id_table[EOS]]
Beispiel #5
0
def inflect(verb, ending, rule):
    """Build the inflected forms of ``verb`` + ``ending`` described by ``rule``.

    The first and last characters of ``rule`` are dropped and the remainder
    is split on ``"/"`` into alternatives. Each alternative is a
    comma-separated triple ``"end,insertion,start"``; ``end`` and ``start``
    are integers or empty, where empty leaves that slice bound open.

    For each alternative the decomposed verb is cut to ``[:end]``, followed
    by ``insertion`` and by the decomposed ending cut to ``[start:]``. HCJ
    characters in the ending are first converted with
    ``hcj_to_jamo(char, "tail")``. The result is passed to ``j2syl``.

    Args:
        verb: Verb to inflect.
        ending: Ending to attach.
        rule: Rule string as described above; may be empty or ``None``.

    Returns:
        A list with one form per alternative, or an empty list when ``rule``
        is falsy.
    """
    if not rule:
        return []

    verb_jamo = h2j(verb)
    ending_jamo = h2j(ending)
    ending_jamo = "".join(
        hcj_to_jamo(char, "tail") if is_hcj(char) else char
        for char in ending_jamo)

    forms = []
    # The loop variable is deliberately not called `rule`, so the parameter
    # is not shadowed.
    for alternative in rule[1:-1].split("/"):
        end, insertion, start = alternative.split(",")

        # An empty bound means "open". None expresses that for any length,
        # whereas a fixed upper bound such as 100 would silently truncate a
        # longer decomposed verb.
        end = int(end) if end else None
        start = int(start) if start else None

        forms.append(j2syl(verb_jamo[:end] + insertion + ending_jamo[start:]))
    return forms
Beispiel #6
0
def tokenize(text, as_id=False, symbol_type=1, debug=False):
    """Turn ``text`` into a list of symbols, or symbol ids, ending with EOS.

    The text is passed through ``normalize`` and ``hangul_to_jamo``; every
    resulting character for which ``is_hcj`` is true is converted with
    ``hcj_to_jamo(char, "lead")``. Each token is then looked up in the symbol
    map chosen by ``symbol_type`` (1 to 4) and the characters of the mapped
    value are appended to the output. For ``symbol_type`` 1 only, a fixed set
    of punctuation characters is replaced by a space before the lookup.

    Args:
        text: Input text.
        as_id: When true, return ids taken from the ``char_to_id_<n>`` table
            matching ``symbol_type`` instead of the symbols themselves.
        symbol_type: Selects the symbol map / id table pair (1, 2, 3 or 4).
        debug: When true, print the selected id table before tokenizing.

    Returns:
        A list of symbols (or ids) followed by EOS (or its id), or ``None``
        when ``symbol_type`` is not one of 1 to 4.
    """
    # All four symbol maps are loaded on every call, regardless of
    # symbol_type.
    symbol_maps = (
        load_symbols_1(),
        load_symbols_2(),
        load_symbols_3(),
        load_symbols_4(),
    )
    # Lambdas defer the lookup of each module-level id table until it is
    # actually needed (debug output or as_id).
    id_table_getters = (
        lambda: char_to_id_1,
        lambda: char_to_id_2,
        lambda: char_to_id_3,
        lambda: char_to_id_4,
    )

    jamo_chars = list(hangul_to_jamo(normalize(text)))
    pre_tokens = [
        hcj_to_jamo(char, "lead") if is_hcj(char) else char
        for char in jamo_chars
    ]

    index = next(
        (i for i, key in enumerate((1, 2, 3, 4)) if symbol_type == key), None)
    if index is None:
        # Unknown symbol_type: nothing is tokenized.
        return None
    symbol_map = symbol_maps[index]
    get_id_table = id_table_getters[index]

    if debug:
        print(get_id_table())

    # Characters blanked out before the lookup; applied for symbol_type 1
    # only.
    needs_cleanup = index == 0
    to_space = str.maketrans(
        dict.fromkeys('\u201d\u2026\u2018\u201c\u2019\xb7\xa0', ' '))

    tokens = []
    for token in pre_tokens:
        if needs_cleanup:
            token = token.translate(to_space)
            # NOTE(review): this three-character pattern looks like the UTF-8
            # bytes of U+110B read as separate characters; it can only match
            # a token of at least three characters -- confirm it is still
            # needed.
            token = token.replace('\xe1\x84\x8b', ' ')
        tokens.extend(symbol_map[token])

    if not as_id:
        return list(tokens) + [EOS]

    id_table = get_id_table()
    return [id_table[token] for token in tokens] + [id_table[EOS]]