Example #1
def thai_negations() -> frozenset:
    """
    Return a frozenset of Thai negation words
    """
    global _THAI_NEGATIONS
    if not _THAI_NEGATIONS:
        _THAI_NEGATIONS = get_corpus(_THAI_NEGATIONS_FILENAME)

    return _THAI_NEGATIONS
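
This accessor, like the others in the examples below, follows a lazy-loading pattern: a module-level cache starts out empty, the corpus is read on the first call, and subsequent calls return the cached frozenset. A minimal standalone sketch of that pattern (the file name my_corpus.txt and the _load_lines helper are hypothetical stand-ins, not part of PyThaiNLP):

from typing import FrozenSet

_MY_CORPUS: FrozenSet[str] = frozenset()   # empty until first use
_MY_CORPUS_FILENAME = "my_corpus.txt"      # hypothetical one-entry-per-line file


def _load_lines(filename: str) -> FrozenSet[str]:
    # Hypothetical stand-in for pythainlp.corpus.get_corpus()
    with open(filename, encoding="utf-8") as fh:
        return frozenset(line.strip() for line in fh if line.strip())


def my_corpus() -> FrozenSet[str]:
    """Return the corpus, reading it from disk only on the first call."""
    global _MY_CORPUS
    if not _MY_CORPUS:
        _MY_CORPUS = _load_lines(_MY_CORPUS_FILENAME)
    return _MY_CORPUS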
Example #2
def provinces() -> frozenset:
    """
    Return a frozenset of Thailand province names in Thai
    """
    global _THAI_THAILAND_PROVINCES
    if not _THAI_THAILAND_PROVINCES:
        _THAI_THAILAND_PROVINCES = get_corpus(_THAI_THAILAND_PROVINCES_FILENAME)

    return _THAI_THAILAND_PROVINCES
Example #3
def thai_stopwords() -> frozenset:
    """
    Return a frozenset of Thai stopwords
    """
    global _THAI_STOPWORDS
    if not _THAI_STOPWORDS:
        _THAI_STOPWORDS = get_corpus(_THAI_STOPWORDS_FILENAME)

    return _THAI_STOPWORDS
Example #4
def thai_syllables() -> frozenset:
    """
    Return a frozenset of Thai syllables
    """
    global _THAI_SYLLABLES
    if not _THAI_SYLLABLES:
        _THAI_SYLLABLES = get_corpus(_THAI_SYLLABLES_FILENAME)

    return _THAI_SYLLABLES
Example #5
def countries() -> frozenset:
    """
    Return a frozenset of country names in Thai
    """
    global _THAI_COUNTRIES
    if not _THAI_COUNTRIES:
        _THAI_COUNTRIES = get_corpus(_THAI_COUNTRIES_FILENAME)

    return _THAI_COUNTRIES
Example #6
def provinces() -> frozenset:
    """
    Return a frozenset of Thailand province names in Thai
    """
    global _THAI_THAILAND_PROVINCES
    if not _THAI_THAILAND_PROVINCES:
        _THAI_THAILAND_PROVINCES = get_corpus(
            _THAI_THAILAND_PROVINCES_FILENAME)

    return _THAI_THAILAND_PROVINCES
Example #7
def word_freqs():
    """
    Get word frequency from Thai National Corpus (TNC)
    """
    lines = list(get_corpus(_FILENAME))
    listword = []
    for line in lines:
        listindata = line.split("\t")
        listword.append((listindata[0], int(listindata[1])))

    return listword
Example #8
def word_freqs():
    """
    Get word frequency from Thai Textbook Corpus (TTC)
    """
    lines = list(get_corpus(_FILENAME))
    listword = []
    for line in lines:
        listindata = line.split("\t")
        listword.append((listindata[0], int(listindata[1])))

    return listword
Example #9
def word_freqs():
    """
    Get word frequency from Thai National Corpus (TNC)
    """
    lines = list(get_corpus("tnc_freq.txt"))
    listword = []
    for line in lines:
        listindata = line.split("\t")
        listword.append((listindata[0], int(listindata[1])))

    return listword
Example #10
def word_freqs() -> List[Tuple[str, int]]:
    """
    Get word frequency from Thai National Corpus (TNC)
    """
    lines = list(get_corpus(_FILENAME))
    word_freqs = []
    for line in lines:
        word_freq = line.split("\t")
        if len(word_freq) >= 2:
            word_freqs.append((word_freq[0], int(word_freq[1])))

    return word_freqs
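
A usage sketch for the frequency list above: turn the (word, count) pairs into a Counter for cheap lookups and ranking. The import path pythainlp.corpus.tnc is an assumption and may differ between PyThaiNLP versions; the query word is only illustrative.

from collections import Counter

from pythainlp.corpus.tnc import word_freqs   # assumed module path

freqs = Counter(dict(word_freqs()))           # word -> count

print(freqs.most_common(10))                  # ten most frequent words
print(freqs["ประเทศ"])                         # 0 if the word is not in the corpus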
Example #11
def unigram_word_freqs() -> defaultdict:
    """
    Get unigram word frequency from Thai National Corpus (TNC)
    """
    lines = list(get_corpus(_FILENAME))
    _word_freqs = defaultdict(int)
    for i in lines:
        _temp = i.strip().split("\t")
        if len(_temp) >= 2:
            _word_freqs[_temp[0]] = int(_temp[-1])

    return _word_freqs
Example #12
def thai_words() -> frozenset:
    """
    Return a frozenset of Thai words such as "กติกา", "กดดัน", "พิษ",
    and "พิษภัย". \n(See: `dev/pythainlp/corpus/words_th.txt\
    <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/words_th.txt>`_)

    :return: :class:`frozenset` containing words in Thai language.
    :rtype: :class:`frozenset`
    """
    global _THAI_WORDS
    if not _THAI_WORDS:
        _THAI_WORDS = get_corpus(_THAI_WORDS_FILENAME)

    return _THAI_WORDS
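
A usage sketch for thai_words(): the frozenset supports fast membership tests, and it can be unioned with project-specific vocabulary before building a custom tokenizer dictionary (the added word below is illustrative).

from pythainlp.corpus import thai_words

words = thai_words()

print("กติกา" in words)          # True; listed in words_th.txt per the docstring
print(len(words))                # size of the stock dictionary

# Extend the stock dictionary with domain-specific vocabulary.
custom_words = words | {"ปัญญาประดิษฐ์"}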
Example #13
def thai_stopwords() -> frozenset:
    """
    Return a frozenset of Thai stopwords such as "มี", "ไป", "ไง", "ขณะ",
    "การ", and "ประการหนึ่ง". \n(See: `dev/pythainlp/corpus/stopwords_th.txt\
    <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/stopwords_th.txt>`_)

    :return: :class:`frozenset` containing stopwords.
    :rtype: :class:`frozenset`
    """
    global _THAI_STOPWORDS
    if not _THAI_STOPWORDS:
        _THAI_STOPWORDS = get_corpus(_THAI_STOPWORDS_FILENAME)

    return _THAI_STOPWORDS
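
The usual application of the stopword set is filtering an already-tokenized text; a short sketch (the token list is illustrative):

from pythainlp.corpus import thai_stopwords

stopwords = thai_stopwords()

tokens = ["การ", "เดินทาง", "ไป", "เชียงใหม่"]
content = [t for t in tokens if t not in stopwords]
print(content)    # "การ" and "ไป" are dropped, per the stopword list above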
Example #14
def thai_negations() -> frozenset:
    """
    Return a frozenset of Thai negation words including "ไม่" and "แต่".
    \n(See: `dev/pythainlp/corpus/negations_th.txt\
    <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/negations_th.txt>`_)

    :return: :class:`frozenset` containing negations in Thai language.
    :rtype: :class:`frozenset`
    """
    global _THAI_NEGATIONS
    if not _THAI_NEGATIONS:
        _THAI_NEGATIONS = get_corpus(_THAI_NEGATIONS_FILENAME)

    return _THAI_NEGATIONS
Example #15
def thai_male_names() -> frozenset:
    """
    Return a frozenset of Thai male names
    \n(See: `dev/pythainlp/corpus/person_names_male_th.txt\
    <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/person_names_male_th.txt>`_)

    :return: :class:`frozenset` containing Thai male names.
    :rtype: :class:`frozenset`
    """
    global _THAI_MALE_NAMES
    if not _THAI_MALE_NAMES:
        _THAI_MALE_NAMES = get_corpus(_THAI_MALE_NAMES_FILENAME)

    return _THAI_MALE_NAMES
Example #16
def thai_family_names() -> FrozenSet[str]:
    """
    Return a frozenset of Thai family names
    \n(See: `dev/pythainlp/corpus/family_names_th.txt\
    <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/family_names_th.txt>`_)

    :return: :class:`frozenset` containing Thai family names.
    :rtype: :class:`frozenset`
    """
    global _THAI_FAMLIY_NAMES
    if not _THAI_FAMLIY_NAMES:
        _THAI_FAMLIY_NAMES = get_corpus(_THAI_FAMLIY_NAMES_FILENAME)

    return _THAI_FAMLIY_NAMES
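
The two name lists above can act as a simple gazetteer for spotting person-name tokens. A sketch, assuming thai_male_names and thai_family_names are exported from pythainlp.corpus like the other accessors in these examples:

from pythainlp.corpus import thai_family_names, thai_male_names   # assumed exports

male_names = thai_male_names()
family_names = thai_family_names()


def looks_like_person_name(token: str) -> bool:
    # Gazetteer lookup: is the token a known male given name or family name?
    return token in male_names or token in family_names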
Example #17
def word_freqs() -> List[Tuple[str, int]]:
    """
    Get word frequency from Thai Textbook Corpus (TTC)
    \n(See: `dev/pythainlp/corpus/ttc_freq.txt\
    <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/ttc_freq.txt>`_)
    """
    lines = list(get_corpus(_FILENAME))
    word_freqs = []
    for line in lines:
        word_freq = line.split("\t")
        if len(word_freq) >= 2:
            word_freqs.append((word_freq[0], int(word_freq[1])))

    return word_freqs
Example #18
def thai_syllables() -> frozenset:
    """
    Return a frozenset of Thai syllables such as "กรอบ", "ก็", "๑", "โมบ",
    "โมน", "โม่ง", "กา", "ก่า", and "ก้า".
    \n(See: `dev/pythainlp/corpus/syllables_th.txt\
    <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/syllables_th.txt>`_)

    :return: :class:`frozenset` containing syllables in Thai language.
    :rtype: :class:`frozenset`
    """
    global _THAI_SYLLABLES
    if not _THAI_SYLLABLES:
        _THAI_SYLLABLES = get_corpus(_THAI_SYLLABLES_FILENAME)

    return _THAI_SYLLABLES
Example #19
def countries() -> frozenset:
    """
    Return a frozenset of country names in Thai such as "แคนาดา", "โรมาเนีย",
    "แอลจีเรีย", and "ลาว".
    \n(See: `dev/pythainlp/corpus/countries_th.txt\
    <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/countries_th.txt>`_)

    :return: :class:`frozenset` containing country names in Thai
    :rtype: :class:`frozenset`
    """
    global _THAI_COUNTRIES
    if not _THAI_COUNTRIES:
        _THAI_COUNTRIES = get_corpus(_THAI_COUNTRIES_FILENAME)

    return _THAI_COUNTRIES
Example #20
def provinces() -> frozenset:
    """
    Return a frozenset of Thailand province names in Thai such as "กระบี่",
    "กรุงเทพมหานคร", "กาญจนบุรี", and "อุบลราชธานี".
    \n(See: `dev/pythainlp/corpus/thailand_provinces_th.txt\
    <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/thailand_provinces_th.txt>`_)

    :return: :class:`frozenset` containing province names of Thailand
    :rtype: :class:`frozenset`
    """
    global _THAI_THAILAND_PROVINCES
    if not _THAI_THAILAND_PROVINCES:
        _THAI_THAILAND_PROVINCES = get_corpus(
            _THAI_THAILAND_PROVINCES_FILENAME)

    return _THAI_THAILAND_PROVINCES
Example #21
def word_freqs() -> List[Tuple[str, int]]:
    """
    Get word frequency from Thai National Corpus (TNC)
    \n(See: `dev/pythainlp/corpus/tnc_freq.txt\
    <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/tnc_freq.txt>`_)

    Credit: Korakot Chaovavanich https://bit.ly/3wSkZsF
    """
    lines = list(get_corpus(_FILENAME))
    word_freqs = []
    for line in lines:
        word_freq = line.split("\t")
        if len(word_freq) >= 2:
            word_freqs.append((word_freq[0], int(word_freq[1])))

    return word_freqs
Example #22
def provinces(details: bool = False) -> Union[FrozenSet[str], List[str]]:
    """
    Return a frozenset of Thailand province names in Thai such as "กระบี่",
    "กรุงเทพมหานคร", "กาญจนบุรี", and "อุบลราชธานี".
    \n(See: `dev/pythainlp/corpus/thailand_provinces_th.txt\
    <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/thailand_provinces_th.txt>`_)

    :param bool details: return details of provinces or not

    :return: :class:`frozenset` containing province names of Thailand \
    (if details is False) or :class:`list` containing :class:`dict` of \
    province names and details such as \
    [{'name_th': 'นนทบุรี', 'abbr_th': 'นบ', 'name_en': 'Nonthaburi', \
    'abbr_en': 'NBI'}].
    :rtype: :class:`frozenset` or :class:`list`
    """
    global _THAI_THAILAND_PROVINCES, _THAI_THAILAND_PROVINCES_DETAILS

    if not _THAI_THAILAND_PROVINCES or not _THAI_THAILAND_PROVINCES_DETAILS:
        provs = set()
        prov_details = list()

        for line in get_corpus(_THAI_THAILAND_PROVINCES_FILENAME, as_is=True):
            p = line.split(",")

            prov = dict()
            prov["name_th"] = p[0]
            prov["abbr_th"] = p[1]
            prov["name_en"] = p[2]
            prov["abbr_en"] = p[3]

            provs.add(prov["name_th"])
            prov_details.append(prov)

        _THAI_THAILAND_PROVINCES = frozenset(provs)
        _THAI_THAILAND_PROVINCES_DETAILS = prov_details

    if details:
        return _THAI_THAILAND_PROVINCES_DETAILS

    return _THAI_THAILAND_PROVINCES
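
A usage sketch for the details variant above: build a Thai-name-to-English-name lookup from the per-province dicts returned when details=True. The keys follow the dict layout shown in the docstring; the import assumes provinces is exported from pythainlp.corpus.

from pythainlp.corpus import provinces   # assumed export

name_en_by_th = {
    prov["name_th"]: prov["name_en"] for prov in provinces(details=True)
}

print(name_en_by_th["นนทบุรี"])   # "Nonthaburi", per the docstring example
print(len(provinces()))           # number of province names in the corpus file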
Example #23
# -*- coding: utf-8 -*-
"""
Thai tokenizers
"""
import re
from typing import Iterable, List, Union

from pythainlp.corpus import get_corpus, thai_syllables, thai_words

from marisa_trie import Trie

DEFAULT_DICT_TRIE = Trie(thai_words())
FROZEN_DICT_TRIE = Trie(get_corpus("words_th_frozen_201810.txt"))


def word_tokenize(text: str,
                  engine: str = "newmm",
                  whitespaces: bool = True) -> List[str]:
    """
    :param str text: text to be tokenized
    :param str engine: tokenizer to be used
    :param bool whitespaces: True to keep whitespace in the output, a common mark of the end of a phrase in Thai
    :Parameters for engine:
        * newmm (default) - dictionary-based, Maximum Matching + Thai Character Cluster
        * longest - dictionary-based, Longest Matching
        * icu - wrapper for ICU, dictionary-based
        * deepcut - wrapper for deepcut, language-model-based https://github.com/rkcosmos/deepcut
        * ulmfit - use newmm engine with a specific dictionary for use with thai2vec
    :return: list of words, tokenized from the text

    **Example**::
Example #24
    "document_vector",
    "merge_wgts",
    "pre_rules_th",
    "post_rules_th",
    "pre_rules_th_sparse",
    "post_rules_th_sparse",
    "process_thai",
    "_THWIKI_LSTM",
]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

_MODEL_NAME_LSTM = "wiki_lm_lstm"
_ITOS_NAME_LSTM = "wiki_itos_lstm"

_THAI2FIT_WORDS = get_corpus("words_th_thai2fit_201810.txt")
_pythainlp_tokenizer = Tokenizer(custom_dict=_THAI2FIT_WORDS, engine="newmm")


# Download pretrained models
def _get_path(fname: str) -> str:
    """
    :meth: download get path of file from pythainlp-corpus
    :param str fname: file name
    :return: path to downloaded file
    """
    path = get_corpus_path(fname)
    if not path:
        download(fname)
        path = get_corpus_path(fname)
    return path
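
Continuing the module fragment above, the helper is typically called with the model-name constants defined earlier; it returns a filesystem path, downloading the corpus entry on first use:

# Resolve (downloading if needed) the pretrained language-model files.
model_path = _get_path(_MODEL_NAME_LSTM)   # "wiki_lm_lstm"
itos_path = _get_path(_ITOS_NAME_LSTM)     # "wiki_itos_lstm"
print(model_path, itos_path)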
Example #25
:See Also:

Inrut, Jeeragone, Patiroop Yuanghirun, Sarayut Paludkong, Supot Nitsuwat, and
Para Limmaneepraserth. "Thai word segmentation using combination of forward
and backward longest matching techniques." In International Symposium on
Communications and Information Technology (ISCIT), pp. 37-40. 2001.
"""
import re
from typing import List

from pythainlp import thai_follow_vowels
from pythainlp.corpus import get_corpus
from pythainlp.tokenize import Tokenizer

_cut_etcc = Tokenizer(get_corpus("etcc.txt"), engine="longest")
_PAT_ENDING_CHAR = f"[{thai_follow_vowels}ๆฯ]"
_RE_ENDING_CHAR = re.compile(_PAT_ENDING_CHAR)


def _cut_subword(tokens: List[str]) -> List[str]:
    len_tokens = len(tokens)
    i = 0
    while True:
        if i >= len_tokens:  # ">=": merging at the last position can push i past the end
            break
        # A single-character token that is a follow vowel, "ๆ", or "ฯ"
        # belongs to the previous token; merge it back.
        if _RE_ENDING_CHAR.search(tokens[i]) and i > 0 and len(tokens[i]) == 1:
            tokens[i - 1] += tokens[i]
            del tokens[i]
            len_tokens -= 1
        i += 1

    return tokens
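
A small illustration of _cut_subword, continuing the module above: a one-character token that is a trailing vowel, "ๆ", or "ฯ" is merged back into the preceding token, while longer tokens are left alone (the sample tokens are illustrative):

print(_cut_subword(["เร็ว", "ๆ", "มาก"]))   # ['เร็วๆ', 'มาก']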
Example #26
# -*- coding: utf-8 -*-
"""
Thai tokenizers
"""
import re
import sys
from typing import Iterable, List, Union

from pythainlp.corpus import get_corpus, thai_syllables, thai_words

from marisa_trie import Trie

DEFAULT_DICT_TRIE = Trie(thai_words())
FROZEN_DICT_TRIE = Trie(get_corpus("words_th_frozen_201810.txt"))


def word_tokenize(
    text: str, custom_dict: Trie = None, engine: str = "newmm", keep_whitespace: bool = True
) -> List[str]:
    """
    :param str text: text to be tokenized
    :param str engine: tokenizer to be used
    :param Trie custom_dict: a dictionary trie
    :param bool keep_whitespace: True to keep whitespace, a common mark of the end of a phrase in Thai
    :Parameters for engine:
        * newmm (default) - dictionary-based, Maximum Matching + Thai Character Cluster
        * longest - dictionary-based, Longest Matching
        * deepcut - wrapper for deepcut, language-model-based https://github.com/rkcosmos/deepcut
        * icu - wrapper for ICU (International Components for Unicode, using PyICU), dictionary-based
        * ulmfit - for thai2fit
        * a custom_dict can be provided for newmm, longest, and deepcut