def thai_negations() -> frozenset:
    """
    Return a frozenset of Thai negation words.

    The corpus is read from disk on first call and cached in a
    module-level variable for subsequent calls.
    """
    global _THAI_NEGATIONS
    if _THAI_NEGATIONS:
        return _THAI_NEGATIONS
    _THAI_NEGATIONS = get_corpus(_THAI_NEGATIONS_FILENAME)
    return _THAI_NEGATIONS
def provinces() -> frozenset:
    """
    Return a frozenset of Thailand province names in Thai.

    Lazily loaded: the corpus file is read only once, then cached
    at module level.
    """
    global _THAI_THAILAND_PROVINCES
    if _THAI_THAILAND_PROVINCES:
        return _THAI_THAILAND_PROVINCES
    _THAI_THAILAND_PROVINCES = get_corpus(_THAI_THAILAND_PROVINCES_FILENAME)
    return _THAI_THAILAND_PROVINCES
def thai_stopwords() -> frozenset:
    """
    Return a frozenset of Thai stopwords.

    Loaded from the stopword corpus on first use and cached in a
    module-level variable.
    """
    global _THAI_STOPWORDS
    if _THAI_STOPWORDS:
        return _THAI_STOPWORDS
    _THAI_STOPWORDS = get_corpus(_THAI_STOPWORDS_FILENAME)
    return _THAI_STOPWORDS
def thai_syllables() -> frozenset:
    """
    Return a frozenset of Thai syllables.

    The syllable corpus is loaded once and memoized at module level.
    """
    global _THAI_SYLLABLES
    if _THAI_SYLLABLES:
        return _THAI_SYLLABLES
    _THAI_SYLLABLES = get_corpus(_THAI_SYLLABLES_FILENAME)
    return _THAI_SYLLABLES
def countries() -> frozenset:
    """
    Return a frozenset of country names in Thai.

    Reads the country-name corpus on first call; later calls return
    the cached module-level value.
    """
    global _THAI_COUNTRIES
    if _THAI_COUNTRIES:
        return _THAI_COUNTRIES
    _THAI_COUNTRIES = get_corpus(_THAI_COUNTRIES_FILENAME)
    return _THAI_COUNTRIES
def provinces() -> frozenset:
    """
    Return a frozenset of Thailand province names in Thai.

    The province corpus is loaded lazily and cached at module level.
    """
    global _THAI_THAILAND_PROVINCES
    if _THAI_THAILAND_PROVINCES:
        return _THAI_THAILAND_PROVINCES
    _THAI_THAILAND_PROVINCES = get_corpus(
        _THAI_THAILAND_PROVINCES_FILENAME
    )
    return _THAI_THAILAND_PROVINCES
def word_freqs():
    """
    Get word frequency from Thai National Corpus (TNC).

    :return: list of (word, frequency) tuples
    """
    listword = []
    for line in get_corpus(_FILENAME):
        # Each line is expected to be "word<TAB>frequency".
        # Guard against blank or malformed lines, which previously
        # raised IndexError; this matches the guarded sibling variants.
        listindata = line.split("\t")
        if len(listindata) >= 2:
            listword.append((listindata[0], int(listindata[1])))
    return listword
def word_freqs():
    """
    Get word frequency from Thai Textbook Corpus (TTC).

    :return: list of (word, frequency) tuples
    """
    listword = []
    for line in get_corpus(_FILENAME):
        # Lines are "word<TAB>frequency"; skip blank or malformed
        # lines instead of raising IndexError (consistent with the
        # guarded TNC variant).
        listindata = line.split("\t")
        if len(listindata) >= 2:
            listword.append((listindata[0], int(listindata[1])))
    return listword
def word_freqs():
    """
    Get word frequency from Thai National Corpus (TNC).

    :return: list of (word, frequency) tuples
    """
    listword = []
    for line in get_corpus("tnc_freq.txt"):
        # Guard against blank or malformed lines, which previously
        # raised IndexError when the split produced fewer than
        # two fields.
        listindata = line.split("\t")
        if len(listindata) >= 2:
            listword.append((listindata[0], int(listindata[1])))
    return listword
def word_freqs() -> List[Tuple[str, int]]:
    """
    Get word frequency from Thai National Corpus (TNC).

    :return: list of (word, frequency) tuples; lines with fewer than
        two tab-separated fields are ignored
    """
    # Split each line on TAB and keep only well-formed records.
    return [
        (fields[0], int(fields[1]))
        for fields in (line.split("\t") for line in get_corpus(_FILENAME))
        if len(fields) >= 2
    ]
def unigram_word_freqs() -> defaultdict:
    """
    Get unigram word frequency from Thai National Corpus (TNC).

    :return: :class:`defaultdict` mapping each word to its frequency
    """
    freqs = defaultdict(int)
    for line in get_corpus(_FILENAME):
        fields = line.strip().split("\t")
        if len(fields) >= 2:
            # Frequency is taken from the last field of the record.
            freqs[fields[0]] = int(fields[-1])
    return freqs
def thai_words() -> frozenset:
    """
    Return a frozenset of Thai words such as "กติกา", "กดดัน", "พิษ",
    and "พิษภัย".

    (See: `dev/pythainlp/corpus/words_th.txt\
    <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/words_th.txt>`_)

    :return: :class:`frozenset` containing words in Thai language.
    :rtype: :class:`frozenset`
    """
    global _THAI_WORDS
    if _THAI_WORDS:
        return _THAI_WORDS
    # Lazy-load on first call; subsequent calls hit the cache.
    _THAI_WORDS = get_corpus(_THAI_WORDS_FILENAME)
    return _THAI_WORDS
def thai_stopwords() -> frozenset:
    """
    Return a frozenset of Thai stopwords such as "มี", "ไป", "ไง",
    "ขณะ", "การ", and "ประการหนึ่ง".

    (See: `dev/pythainlp/corpus/stopwords_th.txt\
    <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/stopwords_th.txt>`_)

    :return: :class:`frozenset` containing stopwords.
    :rtype: :class:`frozenset`
    """
    global _THAI_STOPWORDS
    if _THAI_STOPWORDS:
        return _THAI_STOPWORDS
    # Loaded once from disk, then cached at module level.
    _THAI_STOPWORDS = get_corpus(_THAI_STOPWORDS_FILENAME)
    return _THAI_STOPWORDS
def thai_negations() -> frozenset:
    """
    Return a frozenset of Thai negation words including "ไม่" and "แต่".

    (See: `dev/pythainlp/corpus/negations_th.txt\
    <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/negations_th.txt>`_)

    :return: :class:`frozenset` containing negations in Thai language.
    :rtype: :class:`frozenset`
    """
    global _THAI_NEGATIONS
    if _THAI_NEGATIONS:
        return _THAI_NEGATIONS
    # Cache the corpus the first time it is requested.
    _THAI_NEGATIONS = get_corpus(_THAI_NEGATIONS_FILENAME)
    return _THAI_NEGATIONS
def thai_male_names() -> frozenset:
    """
    Return a frozenset of Thai male names.

    (See: `dev/pythainlp/corpus/person_names_male_th.txt\
    <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/person_names_male_th.txt>`_)

    :return: :class:`frozenset` containing Thai male names.
    :rtype: :class:`frozenset`
    """
    global _THAI_MALE_NAMES
    if _THAI_MALE_NAMES:
        return _THAI_MALE_NAMES
    # Read once, then serve from the module-level cache.
    _THAI_MALE_NAMES = get_corpus(_THAI_MALE_NAMES_FILENAME)
    return _THAI_MALE_NAMES
def thai_family_names() -> FrozenSet[str]:
    """
    Return a frozenset of Thai family names.

    (See: `dev/pythainlp/corpus/family_names_th.txt\
    <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/family_names_th.txt>`_)

    :return: :class:`frozenset` containing Thai family names.
    :rtype: :class:`frozenset`
    """
    # NOTE(review): the cache variable is spelled "_THAI_FAMLIY_NAMES"
    # (sic) at module level; keep that spelling to match its definition.
    global _THAI_FAMLIY_NAMES
    if _THAI_FAMLIY_NAMES:
        return _THAI_FAMLIY_NAMES
    _THAI_FAMLIY_NAMES = get_corpus(_THAI_FAMLIY_NAMES_FILENAME)
    return _THAI_FAMLIY_NAMES
def word_freqs() -> List[Tuple[str, int]]:
    """
    Get word frequency from Thai Textbook Corpus (TTC).

    (See: `dev/pythainlp/corpus/ttc_freq.txt\
    <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/ttc_freq.txt>`_)

    :return: list of (word, frequency) tuples; malformed lines are
        skipped
    """
    freqs = []
    for record in get_corpus(_FILENAME):
        fields = record.split("\t")
        # Only well-formed "word<TAB>frequency" records are kept.
        if len(fields) >= 2:
            freqs.append((fields[0], int(fields[1])))
    return freqs
def thai_syllables() -> frozenset:
    """
    Return a frozenset of Thai syllables such as "กรอบ", "ก็", "๑",
    "โมบ", "โมน", "โม่ง", "กา", "ก่า", and "ก้า".

    (See: `dev/pythainlp/corpus/syllables_th.txt\
    <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/syllables_th.txt>`_)

    :return: :class:`frozenset` containing syllables in Thai language.
    :rtype: :class:`frozenset`
    """
    global _THAI_SYLLABLES
    if _THAI_SYLLABLES:
        return _THAI_SYLLABLES
    # First call reads the corpus file; later calls reuse the cache.
    _THAI_SYLLABLES = get_corpus(_THAI_SYLLABLES_FILENAME)
    return _THAI_SYLLABLES
def countries() -> frozenset:
    """
    Return a frozenset of country names in Thai such as "แคนาดา",
    "โรมาเนีย", "แอลจีเรีย", and "ลาว".

    (See: `dev/pythainlp/corpus/countries_th.txt\
    <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/countries_th.txt>`_)

    :return: :class:`frozenset` containing countries names in Thai
    :rtype: :class:`frozenset`
    """
    global _THAI_COUNTRIES
    if _THAI_COUNTRIES:
        return _THAI_COUNTRIES
    # Loaded lazily and memoized at module level.
    _THAI_COUNTRIES = get_corpus(_THAI_COUNTRIES_FILENAME)
    return _THAI_COUNTRIES
def provinces() -> frozenset:
    """
    Return a frozenset of Thailand province names in Thai such as
    "กระบี่", "กรุงเทพมหานคร", "กาญจนบุรี", and "อุบลราชธานี".

    (See: `dev/pythainlp/corpus/thailand_provinces_th.txt\
    <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/thailand_provinces_th.txt>`_)

    :return: :class:`frozenset` containing province names of Thailand
    :rtype: :class:`frozenset`
    """
    global _THAI_THAILAND_PROVINCES
    if _THAI_THAILAND_PROVINCES:
        return _THAI_THAILAND_PROVINCES
    # Read the corpus only on the first call.
    _THAI_THAILAND_PROVINCES = get_corpus(
        _THAI_THAILAND_PROVINCES_FILENAME
    )
    return _THAI_THAILAND_PROVINCES
def word_freqs() -> List[Tuple[str, int]]:
    """
    Get word frequency from Thai National Corpus (TNC).

    (See: `dev/pythainlp/corpus/tnc_freq.txt\
    <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/tnc_freq.txt>`_)

    Credit: Korakot Chaovavanich https://bit.ly/3wSkZsF
    """
    # Keep only records that split into at least two TAB-separated
    # fields ("word<TAB>frequency").
    return [
        (fields[0], int(fields[1]))
        for fields in (line.split("\t") for line in get_corpus(_FILENAME))
        if len(fields) >= 2
    ]
def provinces(details: bool = False) -> Union[FrozenSet[str], List[str]]:
    """
    Return a frozenset of Thailand province names in Thai such as
    "กระบี่", "กรุงเทพมหานคร", "กาญจนบุรี", and "อุบลราชธานี".

    (See: `dev/pythainlp/corpus/thailand_provinces_th.txt\
    <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/thailand_provinces_th.txt>`_)

    :param bool details: return details of provinces or not
    :return: :class:`frozenset` containing province names of Thailand \
        (if details is False) or :class:`list` containing :class:`dict` of \
        province names and details such as \
        [{'name_th': 'นนทบุรี', 'abbr_th': 'นบ', 'name_en': 'Nonthaburi', \
        'abbr_en': 'NBI'}].
    :rtype: :class:`frozenset` or :class:`list`
    """
    global _THAI_THAILAND_PROVINCES, _THAI_THAILAND_PROVINCES_DETAILS
    if not _THAI_THAILAND_PROVINCES or not _THAI_THAILAND_PROVINCES_DETAILS:
        provs = set()
        prov_details = []
        for line in get_corpus(_THAI_THAILAND_PROVINCES_FILENAME, as_is=True):
            fields = line.split(",")
            # Each record is "name_th,abbr_th,name_en,abbr_en".
            # Previously a blank or malformed line raised IndexError;
            # skip such lines instead.
            if len(fields) < 4:
                continue
            prov = {
                "name_th": fields[0],
                "abbr_th": fields[1],
                "name_en": fields[2],
                "abbr_en": fields[3],
            }
            provs.add(prov["name_th"])
            prov_details.append(prov)
        _THAI_THAILAND_PROVINCES = frozenset(provs)
        _THAI_THAILAND_PROVINCES_DETAILS = prov_details

    if details:
        return _THAI_THAILAND_PROVINCES_DETAILS
    return _THAI_THAILAND_PROVINCES
# -*- coding: utf-8 -*- """ Thai tokenizers """ import re from typing import Iterable, List, Union from pythainlp.corpus import get_corpus, thai_syllables, thai_words from marisa_trie import Trie DEFAULT_DICT_TRIE = Trie(thai_words()) FROZEN_DICT_TRIE = Trie(get_corpus("words_th_frozen_201810.txt")) def word_tokenize(text: str, engine: str = "newmm", whitespaces: bool = True) -> List[str]: """ :param str text: text to be tokenized :param str engine: tokenizer to be used :param bool whitespaces: True to output no whitespace, a common mark of end of phrase in Thai :Parameters for engine: * newmm (default) - dictionary-based, Maximum Matching + Thai Character Cluster * longest - dictionary-based, Longest Matching * icu - wrapper for ICU, dictionary-based * deepcut - wrapper for deepcut, language-model-based https://github.com/rkcosmos/deepcut * ulmfit - use newmm engine with a specific dictionary for use with thai2vec :return: list of words, tokenized from the text **Example**::
"document_vector", "merge_wgts", "pre_rules_th", "post_rules_th", "pre_rules_th_sparse", "post_rules_th_sparse", "process_thai", "_THWIKI_LSTM", ] device = torch.device("cuda" if torch.cuda.is_available() else "cpu") _MODEL_NAME_LSTM = "wiki_lm_lstm" _ITOS_NAME_LSTM = "wiki_itos_lstm" _THAI2FIT_WORDS = get_corpus("words_th_thai2fit_201810.txt") _pythainlp_tokenizer = Tokenizer(custom_dict=_THAI2FIT_WORDS, engine="newmm") # Download pretrained models def _get_path(fname: str) -> str: """ :meth: download get path of file from pythainlp-corpus :param str fname: file name :return: path to downloaded file """ path = get_corpus_path(fname) if not path: download(fname) path = get_corpus_path(fname) return path
:See Also: Inrut, Jeeragone, Patiroop Yuanghirun, Sarayut Paludkong, Supot Nitsuwat, and Para Limmaneepraserth. "Thai word segmentation using combination of forward and backward longest matching techniques." In International Symposium on Communications and Information Technology (ISCIT), pp. 37-40. 2001. """ import re from typing import List from pythainlp import thai_follow_vowels from pythainlp.corpus import get_corpus from pythainlp.tokenize import Tokenizer _cut_etcc = Tokenizer(get_corpus("etcc.txt"), engine="longest") _PAT_ENDING_CHAR = f"[{thai_follow_vowels}ๆฯ]" _RE_ENDING_CHAR = re.compile(_PAT_ENDING_CHAR) def _cut_subword(tokens: List[str]) -> List[str]: len_tokens = len(tokens) i = 0 while True: if i == len_tokens: break if _RE_ENDING_CHAR.search(tokens[i]) and i > 0 and len(tokens[i]) == 1: tokens[i - 1] += tokens[i] del tokens[i] len_tokens -= 1 i += 1
# -*- coding: utf-8 -*- """ Thai tokenizers """ import re import sys from typing import Iterable, List, Union from pythainlp.corpus import get_corpus, thai_syllables, thai_words from marisa_trie import Trie DEFAULT_DICT_TRIE = Trie(thai_words()) FROZEN_DICT_TRIE = Trie(get_corpus("words_th_frozen_201810.txt")) def word_tokenize( text: str, custom_dict: Trie = None, engine: str = "newmm", keep_whitespace: bool = True ) -> List[str]: """ :param str text: text to be tokenized :param str engine: tokenizer to be used :param dict custom_dict: a dictionary trie :param bool keep_whitespace: True to keep whitespaces, a common mark for end of phrase in Thai :Parameters for engine: * newmm (default) - dictionary-based, Maximum Matching + Thai Character Cluster * longest - dictionary-based, Longest Matching * deepcut - wrapper for deepcut, language-model-based https://github.com/rkcosmos/deepcut * icu - wrapper for ICU (International Components for Unicode, using PyICU), dictionary-based * ulmfit - for thai2fit * a custom_dict can be provided for newmm, longest, and deepcut