Example #1
from collections import defaultdict
from typing import Iterator

from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE
from pythainlp.util import Trie

# LatticeString and _PAT_NONTHAI are defined elsewhere in
# pythainlp.tokenize.multi_cut, the module this excerpt comes from.


def _multicut(
    text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
) -> Iterator[LatticeString]:
    """Return LatticeString"""
    if not custom_dict:
        custom_dict = DEFAULT_WORD_DICT_TRIE

    len_text = len(text)
    words_at = defaultdict(list)  # main data structure

    def serialize(p, p2):  # helper: yield all segmentations from position p to p2
        for w in words_at[p]:
            p_ = p + len(w)
            if p_ == p2:
                yield w
            elif p_ < p2:
                for path in serialize(p_, p2):
                    yield w + "/" + path

    q = {0}
    last_p = 0  # last position for yield
    while min(q) < len_text:
        p = min(q)
        q -= {p}  # q.pop, but for set

        for w in custom_dict.prefixes(text[p:]):
            words_at[p].append(w)
            q.add(p + len(w))

        len_q = len(q)

        if len_q == 1:
            q0 = min(q)
            yield LatticeString(text[last_p:q0], serialize(last_p, q0))
            last_p = q0
        elif len_q == 0:  # no candidates: not found in dictionary
            m = _PAT_NONTHAI.match(text[p:])
            if m:  # non-Thai token
                i = p + m.end()
            else:  # Thai token, find minimum skip
                for i in range(p, len_text):
                    ww = custom_dict.prefixes(text[i:])
                    m = _PAT_NONTHAI.match(text[i:])
                    if ww or m:
                        break
                else:
                    i = len_text
            w = text[p:i]
            words_at[p].append(w)
            yield LatticeString(w, in_dict=False)
            last_p = i
            q.add(i)
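For context, _multicut() is the lattice builder inside pythainlp.tokenize.multi_cut. Below is a minimal usage sketch of the module's public wrappers, assuming a standard PyThaiNLP install: segment() picks one segmentation, while find_all_segment() expands the lattice into every segmentation.

# Usage sketch, assuming PyThaiNLP is installed.
from pythainlp.tokenize.multi_cut import find_all_segment, segment

text = "ผมรักคุณ"
print(segment(text))          # one segmentation (maximal matching)
for candidate in find_all_segment(text):
    print(candidate)          # every segmentation, "/"-delimited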
Example #2
from typing import Iterable, Set

from pythainlp.corpus import thai_words
from pythainlp.tokenize import newmm
from pythainlp.util import Trie

# revise_wordset() is defined earlier in the same module
# (pythainlp.corpus.util, which this excerpt comes from).


def revise_newmm_default_wordset(
    training_data: Iterable[Iterable[str]],
) -> Set[str]:
    """
    Revise a set of words to improve the tokenization performance of
    `pythainlp.tokenize.newmm`, the dictionary-based tokenizer that is
    the default tokenizer for PyThaiNLP.

    Words from `pythainlp.corpus.thai_words()` are used as the base set
    for the dictionary. Words that do not perform well on `training_data`
    are removed; the remaining words are returned.

    :param Iterable[Iterable[str]] training_data: tokenized text, to be
        used as a training set
    :return: a revised set of words, with poorly performing words removed
    :rtype: Set[str]
    """
    orig_words = thai_words()
    trie = Trie(orig_words)

    def tokenize(text):
        return newmm.segment(text, custom_dict=trie)

    revised_words = revise_wordset(tokenize, orig_words, training_data)
    return revised_words
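A hedged usage sketch follows; the training corpus below is a toy placeholder, and real usage would feed a properly segmented corpus.

# Usage sketch with a tiny, hand-tokenized training set (placeholder data).
training_data = [
    ["ผม", "รัก", "คุณ"],
    ["วันนี้", "อากาศ", "ดี"],
]
revised = revise_newmm_default_wordset(training_data)
print(len(revised))  # size of the revised dictionary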
Example #3
    # Excerpt from PyThaiNLP's test suite; it assumes the suite's usual
    # imports (os, Trie, dict_trie, thai_words) and module constants
    # (_CORPUS_PATH, _THAI_WORDS_FILENAME).
    def test_trie(self):
        self.assertIsNotNone(Trie([]))
        self.assertIsNotNone(Trie(["ทดสอบ", "ทด", "ทอด", "ทอผ้า"]))
        self.assertIsNotNone(Trie({"ทอด", "ทอง", "ทาง"}))
        self.assertIsNotNone(Trie(("ทอด", "ทอง", "ทาง")))
        self.assertIsNotNone(Trie(Trie(["ทดสอบ", "ทดลอง"])))

        trie = Trie(["ทด", "ทดสอบ", "ทดลอง"])
        self.assertIn("ทด", trie)
        trie.add("ทบ")
        self.assertEqual(len(trie), 4)
        self.assertEqual(len(trie.prefixes("ทดสอบ")), 2)

        trie.remove("ทบ")
        trie.remove("ทด")
        self.assertEqual(len(trie), 2)

        trie = Trie([])
        self.assertEqual(len(trie), 0)
        trie.remove("หมด")
        self.assertEqual(len(trie), 0)

        self.assertIsNotNone(dict_trie(Trie(["ลอง", "ลาก"])))
        self.assertIsNotNone(dict_trie(("ลอง", "สร้าง", "Trie", "ลน")))
        self.assertIsNotNone(dict_trie(["ลอง", "สร้าง", "Trie", "ลน"]))
        self.assertIsNotNone(dict_trie({"ลอง", "สร้าง", "Trie", "ลน"}))
        self.assertIsNotNone(dict_trie(thai_words()))
        self.assertIsNotNone(
            dict_trie(os.path.join(_CORPUS_PATH, _THAI_WORDS_FILENAME)))
        with self.assertRaises(TypeError):
            dict_trie("")
        with self.assertRaises(TypeError):
            dict_trie(None)
        with self.assertRaises(TypeError):
            dict_trie(42)
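The same Trie can drive the tokenizer directly. A minimal sketch, assuming a standard PyThaiNLP install, where word_tokenize() accepts a custom_dict for the dictionary-based engines:

# Minimal sketch: a custom dictionary Trie used by the newmm tokenizer.
from pythainlp.tokenize import word_tokenize
from pythainlp.util import Trie

custom = Trie(["ทดสอบ", "ระบบ"])  # small custom dictionary
print(word_tokenize("ทดสอบระบบ", custom_dict=custom, engine="newmm"))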
Example #4
from collections import defaultdict
from heapq import heappop, heappush
from typing import Generator

from pythainlp.tokenize.tcc import tcc_pos
from pythainlp.util import Trie

# _PAT_NONTHAI, _PAT_THAI_TWOCHARS, _MAX_GRAPH_SIZE and _bfs_paths_graph()
# are defined elsewhere in pythainlp.tokenize.newmm, the module this
# excerpt comes from.


def _onecut(text: str, custom_dict: Trie) -> Generator[str, None, None]:
    # main data structure:
    # - key is begin position (int)
    # - value is possible end positions (List[int])
    # if key is not found, value is empty list
    graph = defaultdict(list)

    graph_size = 0  # keep track of graph size, if too big will force cutoff

    valid_poss = tcc_pos(text)  # breaking positions that are TCC-valid

    len_text = len(text)
    pos_list = [0]  # priority queue of possible breaking positions
    end_pos = 0
    while pos_list[0] < len_text:
        begin_pos = heappop(pos_list)
        for word in custom_dict.prefixes(text[begin_pos:]):
            end_pos_candidate = begin_pos + len(word)
            if end_pos_candidate in valid_poss:
                graph[begin_pos].append(end_pos_candidate)
                graph_size = graph_size + 1

                if end_pos_candidate not in pos_list:
                    heappush(pos_list, end_pos_candidate)

                if graph_size > _MAX_GRAPH_SIZE:
                    break

        len_pos_list = len(pos_list)
        if len_pos_list == 1:  # one candidate, no longer ambiguous
            end_pos_candidates = next(
                _bfs_paths_graph(graph, end_pos, pos_list[0]))
            graph_size = 0
            for pos in end_pos_candidates[1:]:
                yield text[end_pos:pos]
                end_pos = pos
        elif len_pos_list == 0:  # no candidate, deal with non-dictionary word
            m = _PAT_NONTHAI.match(text[begin_pos:])
            if m:  # non-Thai token, skip to the end
                end_pos = begin_pos + m.end()
            else:  # Thai token, find minimum skip
                for pos in range(begin_pos + 1, len_text):
                    if pos in valid_poss:
                        prefix = text[pos:]
                        words = [
                            word for word in custom_dict.prefixes(prefix)
                            if ((pos + len(word) in valid_poss)
                                and not _PAT_THAI_TWOCHARS.match(word))
                        ]
                        if words:  # a Thai token longer than two characters
                            end_pos = pos
                            break

                        # is a non-Thai token
                        if _PAT_NONTHAI.match(prefix):
                            end_pos = pos
                            break
                else:
                    end_pos = len_text

            graph[begin_pos].append(end_pos)
            graph_size = graph_size + 1
            yield text[begin_pos:end_pos]
            heappush(pos_list, end_pos)
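_onecut() is the core generator behind the public newmm API. A short usage sketch, assuming a standard PyThaiNLP install; segment() builds on _onecut() and falls back to the default dictionary when none is given.

# Usage sketch: the public wrapper around _onecut().
from pythainlp.tokenize.newmm import segment

print(segment("ผมรักภาษาไทย"))  # e.g. ['ผม', 'รัก', 'ภาษาไทย']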