def _multicut(
    text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
) -> Iterator[LatticeString]:
    """Return LatticeString"""
    if not custom_dict:
        custom_dict = DEFAULT_WORD_DICT_TRIE

    len_text = len(text)
    words_at = defaultdict(list)  # main data structure

    def serialize(p, p2):  # helper function
        # enumerate every sequence of dictionary words
        # that exactly spans positions p to p2, joined with "/"
        for w in words_at[p]:
            p_ = p + len(w)
            if p_ == p2:
                yield w
            elif p_ < p2:
                for path in serialize(p_, p2):
                    yield w + "/" + path

    q = {0}
    last_p = 0  # last position for yield
    while min(q) < len_text:
        p = min(q)
        q -= {p}  # q.pop, but for set

        for w in custom_dict.prefixes(text[p:]):
            words_at[p].append(w)
            q.add(p + len(w))

        len_q = len(q)
        if len_q == 1:
            q0 = min(q)
            yield LatticeString(text[last_p:q0], serialize(last_p, q0))
            last_p = q0
        elif len_q == 0:  # len(q) == 0 means not found in dictionary
            m = _PAT_NONTHAI.match(text[p:])
            if m:  # non-Thai token
                i = p + m.span()[1]
            else:  # Thai token, find minimum skip
                for i in range(p, len_text):
                    ww = custom_dict.prefixes(text[i:])
                    m = _PAT_NONTHAI.match(text[i:])
                    if ww or m:
                        break
                else:
                    i = len_text
            w = text[p:i]
            words_at[p].append(w)
            yield LatticeString(w, in_dict=False)
            last_p = i
            q.add(i)
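# A minimal, self-contained sketch of the serialize() idea above, using a toy
# English dictionary in place of the Thai one. words_at maps each start offset
# to the dictionary words that begin there; serialize() then enumerates every
# word path that exactly spans [p, p2), joined with "/". All names and data
# here are illustrative, not part of PyThaiNLP.
from collections import defaultdict


def _demo_serialize():
    text = "thetable"
    dictionary = {"the", "thet", "table", "a", "able", "ble"}

    # build words_at by brute force (the real code uses Trie.prefixes)
    words_at = defaultdict(list)
    for p in range(len(text)):
        for q in range(p + 1, len(text) + 1):
            if text[p:q] in dictionary:
                words_at[p].append(text[p:q])

    def serialize(p, p2):
        for w in words_at[p]:
            p_ = p + len(w)
            if p_ == p2:
                yield w
            elif p_ < p2:
                for path in serialize(p_, p2):
                    yield w + "/" + path

    return list(serialize(0, len(text)))


print(_demo_serialize())  # ['the/table', 'thet/a/ble', 'thet/able']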
def revise_newmm_default_wordset(
    training_data: Iterable[Iterable[str]],
) -> Set[str]:
    """
    Revise a set of words to improve the tokenization performance of
    `pythainlp.tokenize.newmm`, a dictionary-based tokenizer and the
    default tokenizer for PyThaiNLP.

    Words from `pythainlp.corpus.thai_words()` are used as the base set
    for the dictionary. Words that do not perform well with
    `training_data` are removed. The remaining words are returned.

    :param Iterable[Iterable[str]] training_data: tokenized text, to be used\
        as a training set
    :return: the remaining words, after removing words that made\
        tokenization perform badly
    :rtype: Set[str]
    """
    orig_words = thai_words()
    trie = Trie(orig_words)

    def tokenize(text):
        return newmm.segment(text, custom_dict=trie)

    revised_words = revise_wordset(tokenize, orig_words, training_data)

    return revised_words
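# Hedged usage sketch for revise_newmm_default_wordset(). The import path
# pythainlp.corpus.util is assumed from the function's surrounding module;
# the two-sentence training set is illustrative only (real use would feed a
# large, human-tokenized corpus).
from pythainlp.corpus.util import revise_newmm_default_wordset

training_data = [
    ["ฉัน", "กิน", "ข้าว"],
    ["เขา", "ไป", "โรงเรียน"],
]
revised = revise_newmm_default_wordset(training_data)
print(len(revised))  # size of the revised dictionary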
def test_trie(self):
    # construct from list, set, tuple, and another Trie
    self.assertIsNotNone(Trie([]))
    self.assertIsNotNone(Trie(["ทดสอบ", "ทด", "ทอด", "ทอผ้า"]))
    self.assertIsNotNone(Trie({"ทอด", "ทอง", "ทาง"}))
    self.assertIsNotNone(Trie(("ทอด", "ทอง", "ทาง")))
    self.assertIsNotNone(Trie(Trie(["ทดสอบ", "ทดลอง"])))

    # membership, add/remove, and prefix queries
    trie = Trie(["ทด", "ทดสอบ", "ทดลอง"])
    self.assertIn("ทด", trie)
    trie.add("ทบ")
    self.assertEqual(len(trie), 4)
    self.assertEqual(len(trie.prefixes("ทดสอบ")), 2)

    trie.remove("ทบ")
    trie.remove("ทด")
    self.assertEqual(len(trie), 2)

    # removing from an empty trie is a no-op
    trie = Trie([])
    self.assertEqual(len(trie), 0)
    trie.remove("หมด")
    self.assertEqual(len(trie), 0)

    # dict_trie accepts a Trie, tuple, list, set, word set, or file path
    self.assertIsNotNone(dict_trie(Trie(["ลอง", "ลาก"])))
    self.assertIsNotNone(dict_trie(("ลอง", "สร้าง", "Trie", "ลน")))
    self.assertIsNotNone(dict_trie(["ลอง", "สร้าง", "Trie", "ลน"]))
    self.assertIsNotNone(dict_trie({"ลอง", "สร้าง", "Trie", "ลน"}))
    self.assertIsNotNone(dict_trie(thai_words()))
    self.assertIsNotNone(
        dict_trie(os.path.join(_CORPUS_PATH, _THAI_WORDS_FILENAME))
    )

    # other source types are rejected
    with self.assertRaises(TypeError):
        dict_trie("")
    with self.assertRaises(TypeError):
        dict_trie(None)
    with self.assertRaises(TypeError):
        dict_trie(42)
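# Hedged sketch of the core query these tests exercise: Trie.prefixes() is
# the lookup that both _multicut and _onecut build on. The import path
# pythainlp.util is assumed to expose Trie (as in recent PyThaiNLP releases);
# the output shown is typical, with shorter prefixes listed first.
from pythainlp.util import Trie

trie = Trie(["ทด", "ทดสอบ", "ทดลอง"])
print(trie.prefixes("ทดสอบวันนี้"))  # ['ทด', 'ทดสอบ']: all dictionary words
                                     # starting at offset 0 of the query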
def _onecut(text: str, custom_dict: Trie) -> Generator[str, None, None]:
    # main data structure:
    # - key is begin position (int)
    # - value is possible end positions (List[int])
    # if key is not found, value is empty list
    graph = defaultdict(list)
    graph_size = 0  # keep track of graph size, if too big will force cutoff

    valid_poss = tcc_pos(text)  # breaking positions that are TCC-valid

    len_text = len(text)
    pos_list = [0]  # priority queue of possible breaking positions
    end_pos = 0
    while pos_list[0] < len_text:
        begin_pos = heappop(pos_list)
        for word in custom_dict.prefixes(text[begin_pos:]):
            end_pos_candidate = begin_pos + len(word)
            if end_pos_candidate in valid_poss:
                graph[begin_pos].append(end_pos_candidate)
                graph_size = graph_size + 1

                if end_pos_candidate not in pos_list:
                    heappush(pos_list, end_pos_candidate)

                if graph_size > _MAX_GRAPH_SIZE:
                    break

        len_pos_list = len(pos_list)
        if len_pos_list == 1:  # one candidate, no longer ambiguous
            end_pos_candidates = next(
                _bfs_paths_graph(graph, end_pos, pos_list[0])
            )
            graph_size = 0
            for pos in end_pos_candidates[1:]:
                yield text[end_pos:pos]
                end_pos = pos
        elif len_pos_list == 0:  # no candidate, deal with non-dictionary word
            m = _PAT_NONTHAI.match(text[begin_pos:])
            if m:  # non-Thai token, skip to the end
                end_pos = begin_pos + m.end()
            else:  # Thai token, find minimum skip
                for pos in range(begin_pos + 1, len_text):
                    if pos in valid_poss:
                        prefix = text[pos:]
                        words = [
                            word
                            for word in custom_dict.prefixes(prefix)
                            if (
                                (pos + len(word) in valid_poss)
                                and not _PAT_THAI_TWOCHARS.match(word)
                            )
                        ]
                        if words:  # is a Thai token longer than 2 chars
                            end_pos = pos
                            break

                        # is a non-Thai token
                        if _PAT_NONTHAI.match(prefix):
                            end_pos = pos
                            break
                else:
                    end_pos = len_text

            graph[begin_pos].append(end_pos)
            graph_size = graph_size + 1
            yield text[begin_pos:end_pos]
            heappush(pos_list, end_pos)
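# Hedged usage sketch for the tokenizer built on _onecut. word_tokenize()
# with engine="newmm" is PyThaiNLP's public entry point to this algorithm;
# the sample sentence and the shown output are illustrative.
from pythainlp.tokenize import word_tokenize

print(word_tokenize("ผมรักคุณ", engine="newmm"))  # e.g. ['ผม', 'รัก', 'คุณ']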