def test_is_hcj(self):
    """is_hcj tests
    Test if a single character is a HCJ character.
    HCJ is defined as the U+313x to U+318x block, sans two non-assigned
    code points.
    """
    # Note: The chaeum filler U+3164 is not considered HCJ, but a special
    # character as defined in http://www.unicode.org/charts/PDF/U3130.pdf.
    valid_hcj = itertools.chain((chr(_) for _ in range(0x3131, 0x3164)),
                                (chr(_) for _ in range(0x3165, 0x318f)))
    invalid_edge_cases = (chr(0x3130), chr(0x3164), chr(0x318f))
    invalid_hangul = _get_random_hangul(20)
    invalid_other = "abABzyZY ,.:;~`―—–/!@#$%^&*()[]{}ᄀᄓᅡᅶᆨᇃᇿ"

    # Positive tests
    for _ in valid_hcj:
        assert jamo.is_hcj(_),\
            ("Incorrectly decided U+{} "
             "was not hcj.").format(hex(ord(_))[2:])
    # Negative tests
    for _ in itertools.chain(invalid_edge_cases,
                             invalid_hangul,
                             invalid_other):
        assert not jamo.is_hcj(_),\
            ("Incorrectly decided U+{} "
             "was hcj.").format(hex(ord(_))[2:])
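# Minimal sketch of the boundary cases the test above exercises, calling the
# python-jamo package directly (the example characters are chosen here for
# illustration, not taken from the test fixtures):
import jamo

assert jamo.is_hcj('\u3131')        # 'ㄱ': first assigned HCJ code point
assert not jamo.is_hcj('\u3164')    # chaeum filler: excluded by design
assert not jamo.is_hcj('\uac00')    # '가': a composed syllable, not HCJ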
def combine(self, verb, ending, rule):
    if not rule:
        return []
    stop, postfix, start = rule.split(",")
    stop = None if stop == "" else int(stop)
    start = None if start == "" else int(start)

    # STEP 1. Decompose verb (h: hangul syllable, j: jamo)
    verb = h2j(verb)
    # STEP 2. Slice 1
    verb = verb[:stop]
    # STEP 3. Merge 2 and postfix
    wordform = verb + postfix
    # STEP 4. Decompose ending, converting any HCJ to tail jamo
    ending = h2j(ending)
    ending = "".join(hcj_to_jamo(char, "tail") if is_hcj(char) else char
                     for char in ending)
    # STEP 5. Slice 4
    ending = ending[start:]
    # STEP 6. Merge 3 and 5
    wordform += "|" + ending
    # STEP 7. Compose 6
    wordform = self.compose(wordform)
    return wordform
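# Illustration of the "stop,postfix,start" rule format parsed above (the
# rule value is hypothetical; real rules come from the caller's rule table).
# An empty field means "no slicing" on that side:
stop, postfix, start = ",았,".split(",")
assert (stop, postfix, start) == ("", "았", "")
# stop == ""  -> the decomposed verb is kept whole (verb[:None]);
# start == "" -> the decomposed ending is kept whole (ending[None:]).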
def tokenize(text, as_id=False, symbol_type=1, debug=False):
    j2hj, j2hcj, j2sj, j2shcj = (load_symbols_1(), load_symbols_2(),
                                 load_symbols_3(), load_symbols_4())
    text = normalize(text)
    pre_tokens = list(hangul_to_jamo(text))
    pre_tokens = [hcj_to_jamo(_, "lead") if is_hcj(_) else _
                  for _ in pre_tokens]
    tokens = []
    if symbol_type == 1:
        if debug:
            print(char_to_id_1)
        for token in pre_tokens:
            tokens += list(j2hj[token])
        if as_id:
            return [char_to_id_1[token] for token in tokens] + [char_to_id_1[EOS]]
        return tokens + [EOS]
    elif symbol_type == 2:
        if debug:
            print(char_to_id_2)
        for token in pre_tokens:
            tokens += list(j2hcj[token])
        if as_id:
            return [char_to_id_2[token] for token in tokens] + [char_to_id_2[EOS]]
        return tokens + [EOS]
    elif symbol_type == 3:
        if debug:
            print(char_to_id_3)
        for token in pre_tokens:
            tokens += list(j2sj[token])
        if as_id:
            return [char_to_id_3[token] for token in tokens] + [char_to_id_3[EOS]]
        return tokens + [EOS]
    elif symbol_type == 4:
        if debug:
            print(char_to_id_4)
        for token in pre_tokens:
            tokens += list(j2shcj[token])
        if as_id:
            return [char_to_id_4[token] for token in tokens] + [char_to_id_4[EOS]]
        return tokens + [EOS]
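# The pre-tokenization step above in isolation. hangul_to_jamo, is_hcj, and
# hcj_to_jamo come from the python-jamo package; the symbol tables
# (j2hj, char_to_id_1, ...) are module-level state assumed to exist here.
from jamo import hangul_to_jamo, hcj_to_jamo, is_hcj

pre = list(hangul_to_jamo("안녕"))
pre = [hcj_to_jamo(c, "lead") if is_hcj(c) else c for c in pre]
# pre == ['ᄋ', 'ᅡ', 'ᆫ', 'ᄂ', 'ᅧ', 'ᆼ']  (U+1100-block jamo, ready for lookup)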
def inflect(verb, ending, rule):
    if not rule:
        return []
    verb = h2j(verb)
    ending = h2j(ending)
    ending = "".join(hcj_to_jamo(char, "tail") if is_hcj(char) else char
                     for char in ending)
    rules = rule[1:-1].split("/")  # strip enclosing brackets, split alternatives
    forms = []
    for r in rules:  # renamed from `rule` to avoid shadowing the parameter
        end, insertion, start = r.split(",")
        end = int(end) if end != "" else 100    # 100: sentinel for "keep whole verb"
        start = int(start) if start != "" else 0
        form = verb[:end] + insertion + ending[start:]
        form = j2syl(form)
        forms.append(form)
    return forms
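# Shape of a rule string consumed above (hypothetical example). The outer
# pair of brackets is stripped by rule[1:-1], "/" separates alternative
# surface forms, and each form is an "end,insertion,start" triple whose
# empty fields fall back to the defaults (100 and 0):
alternatives = "(,아,/2,ᄅ,1)"[1:-1].split("/")
assert alternatives == [",아,", "2,ᄅ,1"]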
def tokenize(text, as_id=False, symbol_type=1, debug=False):
    j2hj, j2hcj, j2sj, j2shcj = (load_symbols_1(), load_symbols_2(),
                                 load_symbols_3(), load_symbols_4())
    text = normalize(text)
    pre_tokens = list(hangul_to_jamo(text))
    pre_tokens = [hcj_to_jamo(_, "lead") if is_hcj(_) else _
                  for _ in pre_tokens]
    tokens = []
    if symbol_type == 1:
        if debug:
            print(char_to_id_1)
        for token in pre_tokens:
            # Replace curly quotes, the ellipsis, and a few stray
            # (mis-encoded) characters with spaces before table lookup.
            # token = token.encode('utf-8', 'ignore')
            for ch in ('\u201d', '\u2026', '\u2018', '\u201c', '\u2019',
                       '\xe1\x84\x8b', '\xb7', '\xa0'):
                token = token.replace(ch, ' ')
            tokens += list(j2hj[token])
        if as_id:
            return [char_to_id_1[token] for token in tokens] + [char_to_id_1[EOS]]
        return tokens + [EOS]
    elif symbol_type == 2:
        if debug:
            print(char_to_id_2)
        for token in pre_tokens:
            tokens += list(j2hcj[token])
        if as_id:
            return [char_to_id_2[token] for token in tokens] + [char_to_id_2[EOS]]
        return tokens + [EOS]
    elif symbol_type == 3:
        if debug:
            print(char_to_id_3)
        for token in pre_tokens:
            tokens += list(j2sj[token])
        if as_id:
            return [char_to_id_3[token] for token in tokens] + [char_to_id_3[EOS]]
        return tokens + [EOS]
    elif symbol_type == 4:
        if debug:
            print(char_to_id_4)
        for token in pre_tokens:
            tokens += list(j2shcj[token])
        if as_id:
            return [char_to_id_4[token] for token in tokens] + [char_to_id_4[EOS]]
        return tokens + [EOS]
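# A sketch of the same cleanup applied once to the whole string rather than
# per pre-token (an assumption about intent: every target maps to a space,
# so replacement order does not matter). str.translate handles the single
# code points; the three-character '\xe1\x84\x8b' sequence, which looks like
# a UTF-8 mojibake of 'ᄋ' (U+110B), still needs str.replace:
PUNCT_TO_SPACE = str.maketrans(
    dict.fromkeys('\u201c\u201d\u2018\u2019\u2026\xb7\xa0', ' '))

def clean(text):
    return text.translate(PUNCT_TO_SPACE).replace('\xe1\x84\x8b', ' ')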