def test_tcc(self):
    self.assertEqual(tcc.segment(None), [])
    self.assertEqual(tcc.segment(""), [])
    self.assertEqual(tcc.segment("ประเทศไทย"),
                     ["ป", "ระ", "เท", "ศ", "ไท", "ย"])
    self.assertEqual(list(tcc.tcc("")), [])
    self.assertEqual(tcc.tcc_pos(""), set())
Example #2
    def test_tcc(self):
        self.assertEqual(tcc.segment(None), [])
        self.assertEqual(tcc.segment(""), [])
        self.assertEqual(tcc.segment("ประเทศไทย"), ["ป", "ระ", "เท", "ศ", "ไท", "ย"])

        self.assertEqual(list(tcc.tcc("")), [])
        self.assertEqual(tcc.tcc_pos(""), set())
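The tests above exercise segment(), tcc(), and tcc_pos() on the same string. As a quick illustration of how they relate (a minimal sketch, assuming the pythainlp.tokenize.tcc API used in the tests), tcc_pos() returns the cumulative end offsets of the clusters produced by segment():

from pythainlp.tokenize import tcc

clusters = tcc.segment("ประเทศไทย")  # ['ป', 'ระ', 'เท', 'ศ', 'ไท', 'ย']

positions = set()
end = 0
for cluster in clusters:
    end += len(cluster)
    positions.add(end)  # end offset of each cluster boundary

assert positions == tcc.tcc_pos("ประเทศไทย")  # {1, 3, 5, 6, 8, 9}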
Example #3
# Imports needed by this snippet; _PAT_NONTHAI, _PAT_THAI_TWOCHARS,
# _MAX_GRAPH_SIZE, and _bfs_paths_graph are module-level internals of the
# tokenizer and are assumed to be defined alongside this function.
from collections import defaultdict
from heapq import heappop, heappush
from typing import Generator

from pythainlp.tokenize.tcc import tcc_pos
from pythainlp.util import Trie  # Trie's location may differ across PyThaiNLP versions


def _onecut(text: str, custom_dict: Trie) -> Generator[str, None, None]:
    # main data structure:
    # - key is begin position (int)
    # - value is possible end positions (List[int])
    # if key is not found, value is empty list
    graph = defaultdict(list)

    graph_size = 0  # keep track of graph size, if too big will force cutoff

    valid_poss = tcc_pos(text)  # breaking positions that are TCC-valid

    len_text = len(text)
    pos_list = [0]  # priority queue of possible breaking positions
    end_pos = 0
    while pos_list[0] < len_text:
        begin_pos = heappop(pos_list)
        for word in custom_dict.prefixes(text[begin_pos:]):
            end_pos_candidate = begin_pos + len(word)
            if end_pos_candidate in valid_poss:
                graph[begin_pos].append(end_pos_candidate)
                graph_size = graph_size + 1

                if end_pos_candidate not in pos_list:
                    heappush(pos_list, end_pos_candidate)

                if graph_size > _MAX_GRAPH_SIZE:
                    break

        len_pos_list = len(pos_list)
        if len_pos_list == 1:  # one candidate, no longer ambiguous
            end_pos_candidates = next(
                _bfs_paths_graph(graph, end_pos, pos_list[0]))
            graph_size = 0
            for pos in end_pos_candidates[1:]:
                yield text[end_pos:pos]
                end_pos = pos
        elif len_pos_list == 0:  # no candidate, deal with non-dictionary word
            m = _PAT_NONTHAI.match(text[begin_pos:])
            if m:  # non-Thai token, skip to the end
                end_pos = begin_pos + m.end()
            else:  # Thai token, find minimum skip
                for pos in range(begin_pos + 1, len_text):
                    if pos in valid_poss:
                        prefix = text[pos:]
                        words = [
                            word for word in custom_dict.prefixes(prefix)
                            if ((pos + len(word) in valid_poss)
                                and not _PAT_THAI_TWOCHARS.match(word))
                        ]
                        if words:  # is a Thai token that longer than 2 chars
                            end_pos = pos
                            break

                        # is a non-Thai token
                        if _PAT_NONTHAI.match(prefix):
                            end_pos = pos
                            break
                else:
                    end_pos = len_text

            graph[begin_pos].append(end_pos)
            graph_size = graph_size + 1
            yield text[begin_pos:end_pos]
            heappush(pos_list, end_pos)
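
The generator above can be driven by a thin wrapper to get a token list. A minimal sketch (assuming _onecut and its private helpers are in scope, and that Trie and the default Thai word list are available from pythainlp.util and pythainlp.corpus; the actual PyThaiNLP tokenizer wraps _onecut with additional handling that is omitted here):

from pythainlp.corpus import thai_words
from pythainlp.util import Trie  # location may differ across PyThaiNLP versions

_DEFAULT_DICT = Trie(thai_words())  # prefix trie over the default Thai word list


def segment_example(text: str, custom_dict: Trie = _DEFAULT_DICT) -> list:
    """Hypothetical wrapper: tokenize by exhausting the _onecut generator."""
    if not text:
        return []
    return list(_onecut(text, custom_dict))

# e.g. segment_example("ประเทศไทย") -> ['ประเทศไทย'] or ['ประเทศ', 'ไทย'],
# depending on the contents of the dictionary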
Example #4
# -*- coding: utf-8 -*-

from pythainlp.tokenize import tcc

print(tcc.tcc("ประเทศไทย"))  # ป/ระ/เท/ศ/ไท/ย

print(tcc.tcc_pos("ประเทศไทย"))  # {1, 3, 5, 6, 8, 9}

for ch in tcc.tcc_gen("ประเทศไทย"):  # ป-ระ-เท-ศ-ไท-ย-
    print(ch, end='-')
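
The positions returned by tcc_pos() are exactly the cut points that _onecut in Example #3 treats as valid, which keeps word boundaries from splitting a Thai character cluster. A minimal sketch (assuming the same tcc module as above):

text = "ประเทศไทย"
valid = tcc.tcc_pos(text)  # {1, 3, 5, 6, 8, 9}

print(2 in valid)  # False: cutting after index 2 would split the cluster "ระ"
print(3 in valid)  # True: index 3 is a cluster boundary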