def test_tcc(self):
    """Sanity checks for Thai Character Cluster (TCC) helpers.

    Covers degenerate inputs (None, empty string) and one normal
    Thai word for segment(), tcc() and tcc_pos().
    """
    # None and the empty string both segment to an empty list.
    for degenerate in (None, ""):
        self.assertEqual(tcc.segment(degenerate), [])

    # A regular Thai word splits into its character clusters.
    expected_clusters = ["ป", "ระ", "เท", "ศ", "ไท", "ย"]
    self.assertEqual(tcc.segment("ประเทศไทย"), expected_clusters)

    # Empty input yields an empty generator and an empty position set.
    self.assertEqual(list(tcc.tcc("")), [])
    self.assertEqual(tcc.tcc_pos(""), set())
def _onecut(text: str, custom_dict: Trie) -> Generator[str, None, None]:
    """Yield tokens of *text* using one segmentation pass over *custom_dict*.

    Performs maximal-matching-style word segmentation: it builds a graph of
    dictionary-word boundaries restricted to TCC-valid breaking positions,
    then yields token strings as soon as the segmentation becomes
    unambiguous (only one candidate break position remains) or when a
    non-dictionary span must be skipped.

    :param text: input text to segment
    :param custom_dict: prefix-searchable dictionary (project Trie type)
    :return: generator of token substrings of ``text``, in order
    """
    # main data structure:
    # - key is begin position (int)
    # - value is possible end positions (List[int])
    # if key is not found, value is empty list
    graph = defaultdict(list)
    graph_size = 0  # keep track of graph size, if too big will force cutoff

    valid_poss = tcc_pos(text)  # breaking positions that are TCC-valid

    len_text = len(text)
    pos_list = [0]  # priority queue (heap) of possible breaking positions
    end_pos = 0  # rightmost position already emitted as output
    while pos_list[0] < len_text:
        begin_pos = heappop(pos_list)
        # Extend the graph with every dictionary word starting at begin_pos
        # whose end lands on a TCC-valid boundary.
        for word in custom_dict.prefixes(text[begin_pos:]):
            end_pos_candidate = begin_pos + len(word)
            if end_pos_candidate in valid_poss:
                graph[begin_pos].append(end_pos_candidate)
                graph_size = graph_size + 1

                # NOTE: linear membership test on the heap list — keeps the
                # heap free of duplicate positions.
                if end_pos_candidate not in pos_list:
                    heappush(pos_list, end_pos_candidate)

                # Safety valve: stop growing the graph past the cap.
                if graph_size > _MAX_GRAPH_SIZE:
                    break

        len_pos_list = len(pos_list)
        if len_pos_list == 1:  # one candidate, no longer ambiguous
            # Emit the path from the last emitted position to the sole
            # remaining candidate; _bfs_paths_graph yields position lists,
            # we take the first path found.
            end_pos_candidates = next(
                _bfs_paths_graph(graph, end_pos, pos_list[0])
            )
            graph_size = 0  # reset cutoff counter after flushing output
            for pos in end_pos_candidates[1:]:
                yield text[end_pos:pos]
                end_pos = pos
        elif len_pos_list == 0:  # no candidate, deal with non-dictionary word
            m = _PAT_NONTHAI.match(text[begin_pos:])
            if m:  # non-Thai token, skip to the end
                end_pos = begin_pos + m.end()
            else:  # Thai token, find minimum skip
                for pos in range(begin_pos + 1, len_text):
                    if pos in valid_poss:
                        prefix = text[pos:]
                        # Accept a resume point only where a dictionary word
                        # (longer than a 2-char fragment) starts on a
                        # TCC-valid boundary...
                        words = [
                            word
                            for word in custom_dict.prefixes(prefix)
                            if (
                                (pos + len(word) in valid_poss)
                                and not _PAT_THAI_TWOCHARS.match(word)
                            )
                        ]
                        if words:  # is a Thai token that longer than 2 chars
                            end_pos = pos
                            break

                        # is a non-Thai token
                        if _PAT_NONTHAI.match(prefix):
                            end_pos = pos
                            break
                else:
                    # for-else: no resume point found — skip to end of text.
                    end_pos = len_text

            # Record the forced skip as a graph edge and emit it directly.
            graph[begin_pos].append(end_pos)
            graph_size = graph_size + 1
            yield text[begin_pos:end_pos]
            heappush(pos_list, end_pos)
# -*- coding: utf-8 -*-
"""Demonstrate Thai Character Cluster (TCC) helpers on a sample word."""
from pythainlp.tokenize import tcc

word = "ประเทศไทย"

# Cluster segmentation of the whole word: ป/ระ/เท/ศ/ไท/ย
print(tcc.tcc(word))

# TCC-valid break positions within the word: {1, 3, 5, 6, 8, 9}
print(tcc.tcc_pos(word))

# Stream the clusters one at a time: ป-ระ-เท-ศ-ไท-ย-
for cluster in tcc.tcc_gen(word):
    print(cluster, end='-')