Example #1
# Imports assumed from the PyThaiNLP 1.x API used by this project
# (Example #2 below imports syllable_dict the same way).
from pythainlp.tokenize import (word_tokenize, dict_word_tokenize,
                                create_custom_dict_trie)
from pythainlp.corpus.thaisyllable import get_data as syllable_dict


def syllable_tokenize(text):
    """
    :param str text: input string to be tokenized

    :return: list of syllable strings
    """
    syllables = []
    if text:
        words = word_tokenize(text)
        # Build a trie from the syllable dictionary, then split each word
        # into syllables with the dictionary-based tokenizer.
        trie = create_custom_dict_trie(custom_dict_source=syllable_dict())
        for word in words:
            syllables.extend(dict_word_tokenize(text=word, custom_dict_trie=trie))

    return syllables
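
A minimal usage sketch, assuming PyThaiNLP 1.x is installed so the imports above resolve; the sample phrase is illustrative:

if __name__ == "__main__":
    # Split a short Thai greeting into syllables.
    print(syllable_tokenize("สวัสดีครับ"))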
Example #2
import os
import string

import pythaispell  # the package whose directory holds the trained model
import sklearn_crfsuite
from pythainlp.tokenize import syllable_tokenize as word_tokenize
from pythainlp.spell.pn import NorvigSpellChecker

# PyThaiNLP renamed its corpus helpers between 1.x and 2.x; support both.
try:
    from pythainlp.corpus.thaisyllable import get_data as syllable_dict
    from pythainlp.corpus import stopwords
    stopwords = stopwords.words('thai')
except ImportError:
    from pythainlp.corpus.common import thai_syllables, thai_stopwords
    stopwords = list(thai_stopwords())
    syllable_dict = thai_syllables

# Path to the trained CRF model bundled with the pythaispell package.
templates_file = os.path.join(os.path.dirname(pythaispell.__file__),
                              "sp.model")
# Punctuation characters (underscore excluded) treated as invalid.
invalidChars = set(string.punctuation.replace("_", ""))
# Deduplicated list of known Thai syllables.
dict_s = list(set(syllable_dict()))


def c(word):
    """Return True if the word contains any character from this Thai consonant set."""
    for ch in 'กขฃคฆงจชซญฎฏฐฑฒณดตถทธนบปพฟภมยรลวศษสฬอ':
        if ch in word:
            return True
    return False


def n(word):
    """Return True if the word contains any character from this smaller Thai consonant set."""
    for ch in 'ฅฉผฟฌหฮ':
        if ch in word:
            return True
    return False
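
A quick hedged usage check of the two character-set helpers (the sample word is illustrative):

if __name__ == "__main__":
    word = "ฟ้า"  # illustrative Thai word; 'ฟ' appears in both sets above
    print(c(word), n(word))  # prints: True True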