Code Example #1
File: test_corpus.py Project: preenet/pythainlp
    def test_corpus(self):
        self.assertIsInstance(thai_negations(), frozenset)
        self.assertIsInstance(thai_stopwords(), frozenset)
        self.assertIsInstance(thai_syllables(), frozenset)
        self.assertIsInstance(thai_words(), frozenset)

        self.assertIsInstance(countries(), frozenset)
        self.assertIsInstance(provinces(), frozenset)
        self.assertIsInstance(thai_female_names(), frozenset)
        self.assertIsInstance(thai_male_names(), frozenset)

        self.assertEqual(get_corpus_db_detail("XXX"),
                         {})  # corpus does not exist
        self.assertTrue(download("test"))  # download the first time
        self.assertTrue(download(name="test", force=True))  # force download
        self.assertTrue(download(name="test"))  # try download existing
        self.assertFalse(download(name="test",
                                  url="wrongurl"))  # URL not exist
        self.assertFalse(
            download(name="XxxXXxxx817d37sf"))  # corpus name not exist
        self.assertIsNotNone(get_corpus_db_detail("test"))  # corpus exists
        self.assertTrue(remove("test"))  # remove existing
        self.assertFalse(remove("test"))  # remove non-existing
        self.assertTrue(download(name="test", version="0.1"))
        self.assertTrue(remove("test"))
Code Example #2
    def test_trie(self):
        self.assertIsNotNone(Trie([]))
        self.assertIsNotNone(Trie(["ทดสอบ", "ทด", "ทอด", "ทอผ้า"]))
        self.assertIsNotNone(Trie({"ทอด", "ทอง", "ทาง"}))
        self.assertIsNotNone(Trie(("ทอด", "ทอง", "ทาง")))
        self.assertIsNotNone(Trie(Trie(["ทดสอบ", "ทดลอง"])))

        trie = Trie(["ทด", "ทดสอบ", "ทดลอง"])
        self.assertIn("ทด", trie)
        trie.add("ทบ")
        self.assertEqual(len(trie), 4)
        self.assertEqual(len(trie.prefixes("ทดสอบ")), 2)

        trie.remove("ทบ")
        trie.remove("ทด")
        self.assertEqual(len(trie), 2)

        trie = Trie([])
        self.assertEqual(len(trie), 0)
        trie.remove("หมด")
        self.assertEqual(len(trie), 0)

        self.assertIsNotNone(dict_trie(Trie(["ลอง", "ลาก"])))
        self.assertIsNotNone(dict_trie(("ลอง", "สร้าง", "Trie", "ลน")))
        self.assertIsNotNone(dict_trie(["ลอง", "สร้าง", "Trie", "ลน"]))
        self.assertIsNotNone(dict_trie({"ลอง", "สร้าง", "Trie", "ลน"}))
        self.assertIsNotNone(dict_trie(thai_words()))
        self.assertIsNotNone(
            dict_trie(os.path.join(_CORPUS_PATH, _THAI_WORDS_FILENAME)))
        with self.assertRaises(TypeError):
            dict_trie("")
        with self.assertRaises(TypeError):
            dict_trie(None)
        with self.assertRaises(TypeError):
            dict_trie(42)
Code Example #3
    def test_word_tokenize(self):
        self.assertEqual(word_tokenize(""), [])
        self.assertEqual(
            word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"),
            ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
        )

        self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="newmm"))
        self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="mm"))
        self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="longest"))
        self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="ulmfit"))
        self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="icu"))
        self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="deepcut"))
        self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="XX"))

        self.assertIsNotNone(dict_trie(()))
        self.assertIsNotNone(dict_trie(("ทดสอบ", "สร้าง", "Trie")))
        self.assertIsNotNone(dict_trie(["ทดสอบ", "สร้าง", "Trie"]))
        self.assertIsNotNone(dict_trie(thai_words()))
        self.assertIsNotNone(dict_trie(FROZEN_DICT_TRIE))

        self.assertIsNotNone(word_tokenize("รถไฟฟ้าBTS", custom_dict=DEFAULT_DICT_TRIE))
        self.assertIsNotNone(
            word_tokenize("ทดสอบ", engine="deepcut", custom_dict=FROZEN_DICT_TRIE)
        )
        self.assertIsNotNone(
            word_tokenize("ทดสอบ", engine="XX", custom_dict=FROZEN_DICT_TRIE)
        )
Code Example #4
    def test_corpus(self):
        self.assertIsInstance(thai_negations(), frozenset)
        self.assertIsInstance(thai_stopwords(), frozenset)
        self.assertIsInstance(thai_syllables(), frozenset)
        self.assertIsInstance(thai_words(), frozenset)

        self.assertIsInstance(countries(), frozenset)
        self.assertIsInstance(provinces(), frozenset)
        self.assertIsInstance(provinces(details=True), list)
        self.assertEqual(len(provinces(details=False)),
                         len(provinces(details=True)))
        self.assertIsInstance(thai_family_names(), frozenset)
        self.assertIsInstance(list(thai_family_names())[0], str)
        self.assertIsInstance(thai_female_names(), frozenset)
        self.assertIsInstance(thai_male_names(), frozenset)

        self.assertIsInstance(
            get_corpus_db("https://example.com/XXXXXX0lkjasd/SXfmskdjKKXXX"),
            Response,
        )  # URL does not exist, should get 404 response
        self.assertIsNone(get_corpus_db("XXXlkja3sfdXX"))  # Invalid URL

        self.assertEqual(get_corpus_db_detail("XXXmx3KSXX"),
                         {})  # corpus does not exist
        self.assertEqual(get_corpus_db_detail("XXXmx3KSXX", version="0.2"),
                         {})  # corpus does not exist

        self.assertTrue(download("test"))  # download the first time
        self.assertTrue(download(name="test", force=True))  # force download
        self.assertTrue(download(name="test"))  # try download existing
        self.assertFalse(download(name="test",
                                  url="wrongurl"))  # URL not exist
        self.assertFalse(
            download(name="XxxXXxxx817d37sf"))  # corpus name not exist
        self.assertIsNotNone(get_corpus_db_detail("test"))  # corpus exists
        self.assertIsNotNone(get_corpus_path("test"))  # corpus exists
        self.assertTrue(remove("test"))  # remove existing
        self.assertFalse(remove("test"))  # remove non-existing
        self.assertIsNone(get_corpus_path("XXXkdjfBzc"))  # query non-existing
        self.assertFalse(download(name="test", version="0.0"))
        self.assertFalse(download(name="test", version="0.0.0"))
        self.assertFalse(download(name="test", version="0.0.1"))
        self.assertFalse(download(name="test", version="0.0.2"))
        self.assertFalse(download(name="test", version="0.0.3"))
        self.assertFalse(download(name="test", version="0.0.4"))
        self.assertIsNotNone(download(name="test", version="0.0.5"))
        self.assertTrue(download("test"))
        self.assertIsNotNone(remove("test"))  # remove existing
        self.assertIsNotNone(download(name="test", version="0.0.6"))
        self.assertIsNotNone(download(name="test", version="0.0.7"))
        self.assertIsNotNone(download(name="test", version="0.0.8"))
        self.assertIsNotNone(download(name="test", version="0.0.9"))
        self.assertIsNotNone(download(name="test", version="0.0.10"))
        with self.assertRaises(Exception) as context:
            self.assertIsNotNone(download(name="test", version="0.0.11"))
        self.assertTrue(
            "Hash does not match expected." in str(context.exception))
        self.assertIsNotNone(download(name="test", version="0.1"))
        self.assertIsNotNone(remove("test"))
Code Example #5
File: __init__.py Project: xemoe/pythainlp
 def test_corpus(self):
     self.assertIsNotNone(countries())
     self.assertIsNotNone(provinces())
     self.assertIsNotNone(thai_negations())
     self.assertIsNotNone(thai_stopwords())
     self.assertIsNotNone(thai_syllables())
     self.assertIsNotNone(thai_words())
     download("test")
     self.assertIsNotNone(remove("test"))
     self.assertIsNotNone(remove("tnc_freq"))
Code Example #6
File: __init__.py Project: wannaphongcom/pythainlp
 def test_corpus(self):
     self.assertIsNotNone(countries())
     self.assertIsNotNone(provinces())
     self.assertIsNotNone(thai_negations())
     self.assertIsNotNone(thai_stopwords())
     self.assertIsNotNone(thai_syllables())
     self.assertIsNotNone(thai_words())
     download("test")
     self.assertIsNotNone(remove("test"))
     self.assertIsNotNone(remove("tnc_freq"))
Code Example #7
def test():
    body = json.loads(request.get_data())
    text = body['text']
    try:
        custom_stopwords = body['custom_stopwords']
    except KeyError:
        custom_stopwords = [""]
    try:
        custom_dict = body['custom_dict']
    except KeyError:
        custom_dict = [""]
    # Receive the input from the user

    stop_words = list(thai_stopwords()) + list(STOPWORDS) + custom_stopwords
    stop_words = [word.lower() for word in stop_words]  # lower-case all stopwords
    # These are the words that must not appear in the wordcloud

    pythainlp_words = thai_words()
    dictionary = list(pythainlp_words) + custom_dict
    # Add words that are missing from the Thai/English dictionaries so they are kept whole: e.g. the input "ลุงตู่" would otherwise come out as "ลุง", "ตู่", but with the added dictionary entry the output is "ลุงตู่"

    tok = Tokenizer(dictionary)
    # Set up the tokenizer

    text = tok.word_tokenize(text)
    text = ' '.join(text)
    text = text.lower()
    # Tokenize the text

    wordcloud = WordCloud(stopwords=stop_words,
                          font_path='THSarabunNew.ttf',
                          min_word_length=2,
                          relative_scaling=1.0,
                          min_font_size=1,
                          background_color="black",
                          width=800,
                          height=600,
                          scale=10,
                          font_step=1,
                          collocations=False,
                          colormap="gist_ncar",
                          regexp=r"[\u0E00-\u0E7Fa-zA-Z']+",
                          margin=2).generate(text)
    # Generate the wordcloud

    plt.figure(figsize=(16, 9))
    plt.imshow(wordcloud, cmap=plt.cm.gray, interpolation='bilinear')
    plt.axis("off")
    # Render the wordcloud

    wordcloud.to_file('wordcloud.png')
    gc.collect()
    # Save the image on the server and free memory

    return send_file('wordcloud.png')
Code Example #8
File: test_tokenize.py Project: veer66/pythainlp
    def test_word_tokenize(self):
        self.assertEqual(word_tokenize(""), [])
        self.assertEqual(
            word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"),
            ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
        )

        self.assertIsNotNone(
            word_tokenize("หมอนทองตากลมหูว์MBK39", engine="newmm")
        )
        self.assertIsNotNone(
            word_tokenize("หมอนทองตากลมหูว์MBK39", engine="mm")
        )
        self.assertIsNotNone(
            word_tokenize("หมอนทองตากลมหูว์MBK39", engine="longest")
        )
        self.assertIsNotNone(
            word_tokenize("หมอนทองตากลมหูว์MBK39", engine="icu")
        )
        self.assertIsNotNone(
            word_tokenize("หมอนทองตากลมหูว์MBK39", engine="deepcut")
        )
        self.assertIsNotNone(
            word_tokenize("หมอนทองตากลมหูว์MBK39", engine="attacut")
        )
        self.assertIsNotNone(
            word_tokenize("หมอนทองตากลมหูว์MBK39", engine="XX")
        )  # the "XX" engine does not exist

        self.assertIsNotNone(dict_trie(()))
        self.assertIsNotNone(dict_trie(("ทดสอบ", "สร้าง", "Trie")))
        self.assertIsNotNone(dict_trie(["ทดสอบ", "สร้าง", "Trie"]))
        self.assertIsNotNone(dict_trie({"ทดสอบ", "สร้าง", "Trie"}))
        self.assertIsNotNone(dict_trie(thai_words()))
        self.assertIsNotNone(dict_trie(DEFAULT_DICT_TRIE))
        self.assertIsNotNone(
            dict_trie(os.path.join(_CORPUS_PATH, _THAI_WORDS_FILENAME))
        )

        self.assertTrue(
            "ไฟ" in word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"]))
        )

        # Commented out until this unittest bug get fixed:
        # https://bugs.python.org/issue29620
        # with self.assertWarns(DeprecationWarning):
        #     dict_word_tokenize("เลิกใช้แล้ว", custom_dict=DEFAULT_DICT_TRIE)
        self.assertEqual(
            word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"])),
            dict_word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"])),
        )
Code Example #9
File: test_corpus.py Project: madmuv/pythainlp
 def test_corpus(self):
     self.assertIsNotNone(countries())
     self.assertIsNotNone(provinces())
     self.assertIsNotNone(thai_negations())
     self.assertIsNotNone(thai_stopwords())
     self.assertIsNotNone(thai_syllables())
     self.assertIsNotNone(thai_words())
     self.assertIsNotNone(thai_female_names())
     self.assertIsNotNone(thai_male_names())
     self.assertEqual(get_corpus_db_detail("XXX"), {})
     self.assertIsNone(download("test"))
     self.assertIsNone(download("test", force=True))
     self.assertIsNotNone(get_corpus_db_detail("test"))
     self.assertIsNotNone(remove("test"))
     self.assertFalse(remove("test"))
Code Example #10
    def __init__(self,
                 custom_dict: Union[str, Iterable] = None,
                 tokenize_engine: str = "newmm"):
        """
        Initialize tokenizer object

        :param str custom_dict: a file path or a list of vocabularies used to build the trie (default: the original lexitron word list)
        :param str tokenize_engine: tokenization engine to use (newmm, mm, longest)
        """
        self.__trie_dict = None
        self.word_engine = tokenize_engine
        if custom_dict:
            self.__trie_dict = dict_trie(custom_dict)
        else:
            self.__trie_dict = dict_trie(thai_words())
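A minimal usage sketch for the class above (assuming it is the Tokenizer exposed at the top of the pythainlp package and that it provides a word_tokenize method, as other examples on this page suggest; the extra word is illustrative only):

from pythainlp import Tokenizer
from pythainlp.corpus import thai_words

# Extend the default word list so the custom entry is kept as one token.
words = set(thai_words())
words.add("ลุงตู่")
custom_tokenizer = Tokenizer(custom_dict=list(words))
print(custom_tokenizer.word_tokenize("ลุงตู่ชอบกินข้าว"))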
Code Example #11
def revise_newmm_default_wordset(
    training_data: Iterable[Iterable[str]], ) -> Set[str]:
    """
    Revise a set of word that could improve tokenization performance of
    `pythainlp.tokenize.newmm`, a dictionary-based tokenizer and a default
    tokenizer for PyThaiNLP.

    Words from `pythainlp.corpus.thai_words()` will be used as a base set
    for the dictionary. Words that do not performed well with `training_data`
    will be removed. The remaining words will be returned.

    :param Iterable[Iterable[str]] training_data: tokenized text, to be used\
        as a training set
    :return: words that considered making `tokenize` perform unwell
    :rtype: Set[str]
    """
    orig_words = thai_words()
    trie = Trie(orig_words)

    def tokenize(text):
        return newmm.segment(text, custom_dict=trie)

    revised_words = revise_wordset(tokenize, orig_words, training_data)
    return revised_words
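A hedged usage sketch for the function above (the import path pythainlp.corpus.util and the tiny two-sentence training set are assumptions for illustration):

from pythainlp.corpus.util import revise_newmm_default_wordset  # import path assumed

# training_data is an iterable of already-tokenized sentences
training_data = [
    ["ฉัน", "รัก", "ภาษาไทย"],
    ["รถไฟฟ้า", "มา", "แล้ว"],
]
revised = revise_newmm_default_wordset(training_data)
print(len(revised))  # size of the revised dictionary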
Code Example #12
]
ADDITIONAL_SPECIAL_TOKENS_EXCLUDE_SPACE_TOKEN = \
    [e for e in ADDITIONAL_SPECIAL_TOKENS if e != SPACE_TOKEN]
SET_ADDITIONAL_SPECIAL_TOKENS = frozenset(ADDITIONAL_SPECIAL_TOKENS)

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "th-roberta-base": 514,
}

# Store pre tokenizer function (text cutter)
PRE_TOKENIZERS_MAP = {
    'newmm':
    partial(word_tokenize,
            custom_dict=Trie(
                frozenset(
                    set(thai_words()).union(set(ADDITIONAL_SPECIAL_TOKENS))))),
    'syllable':
    partial(word_tokenize,
            custom_dict=Trie(
                frozenset(
                    set(thai_syllables()).union(
                        set(ADDITIONAL_SPECIAL_TOKENS))))),
}

_nb_cores = multiprocessing.cpu_count()


def split_additional_special_token(texts):
    """
    Split a list of texts by the additional special tokens (excluding the space token).
Code Example #13
_valid_tokens = (set(_digits.keys()) | set(_powers_of_10.keys())
                 | {"ล้าน", "ลบ"})
_tokenizer = Tokenizer(custom_dict=_valid_tokens)


def _check_is_thainum(word: str):
    for j in list(_digits.keys()):
        if j in word:
            return (True, 'num')
    for j in ["สิบ", "ร้อย", "พัน", "หมื่น", "แสน", "ล้าน", "จุด", "ลบ"]:
        if j in word:
            return (True, 'unit')
    return (False, None)


_dict_words = [i for i in list(thai_words()) if not _check_is_thainum(i)[0]]
_dict_words += list(_digits.keys())
_dict_words += ["สิบ", "ร้อย", "พัน", "หมื่น", "แสน", "ล้าน", "จุด"]

_tokenizer_thaiwords = Tokenizer(_dict_words)


def thaiword_to_num(word: str) -> int:
    """
    Converts the spelled-out numerals in Thai scripts into an actual integer.

    :param str word: Spelled-out numerals in Thai scripts
    :return: Corresponding integer value of the input
    :rtype: int

    :Example:
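The docstring's example section is truncated above; as a separate, hedged sketch (assuming the function is also exposed as pythainlp.util.thaiword_to_num):

from pythainlp.util import thaiword_to_num  # import path assumed

print(thaiword_to_num("ห้าสิบ"))          # expected: 50
print(thaiword_to_num("สองล้านสามแสน"))   # expected: 2300000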
Code Example #14
SPACE_TOKEN = "<_>"
DEPRECATED_SPACE_TOKEN = '<th_roberta_space_token>'
SEFR_SPLIT_TOKEN = '<|>'
ADDITIONAL_SPECIAL_TOKENS = ['<s>', '<pad>', '</s>', '<unk>', '<mask>', SPACE_TOKEN, '\n']
ADDITIONAL_SPECIAL_TOKENS_EXCLUDE_SPACE_TOKEN = \
    [e for e in ADDITIONAL_SPECIAL_TOKENS if e != SPACE_TOKEN]
SET_ADDITIONAL_SPECIAL_TOKENS = frozenset(ADDITIONAL_SPECIAL_TOKENS)

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "th-roberta-base": 514,
}

# Store pre tokenizer function (text cutter)
PRE_TOKENIZERS_MAP = {
    'newmm': partial(
        word_tokenize,
        custom_dict=Trie(frozenset(set(thai_words()).union(set(ADDITIONAL_SPECIAL_TOKENS)))),
    ),
    'syllable': partial(
        word_tokenize,
        custom_dict=Trie(frozenset(set(thai_syllables()).union(set(ADDITIONAL_SPECIAL_TOKENS)))),
    ),
}

_nb_cores = multiprocessing.cpu_count()


def split_additional_special_token(texts):
    """
    Split a list of texts by the additional special tokens (excluding the space token).

    Args:
Code Example #15
File: __init__.py Project: wannaphongcom/pythainlp
# -*- coding: utf-8 -*-
"""
Thai tokenizers
"""
import re
import sys
from typing import Iterable, List, Union

from pythainlp.corpus import get_corpus, thai_syllables, thai_words

from marisa_trie import Trie

DEFAULT_DICT_TRIE = Trie(thai_words())
FROZEN_DICT_TRIE = Trie(get_corpus("words_th_frozen_201810.txt"))


def word_tokenize(
    text: str, custom_dict: Trie = None, engine: str = "newmm", keep_whitespace: bool = True
) -> List[str]:
    """
    :param str text: text to be tokenized
    :param str engine: tokenizer to be used
    :param dict custom_dict: a dictionary trie
    :param bool keep_whitespace: True to keep whitespaces, a common mark for end of phrase in Thai
    :Parameters for engine:
        * newmm (default) - dictionary-based, Maximum Matching + Thai Character Cluster
        * longest - dictionary-based, Longest Matching
        * deepcut - wrapper for deepcut, language-model-based https://github.com/rkcosmos/deepcut
        * icu - wrapper for ICU (International Components for Unicode, using PyICU), dictionary-based
        * ulmfit - for thai2fit
        * a custom_dict can be provided for newmm, longest, and deepcut
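A brief usage sketch of the engine selection described in this docstring (exact token boundaries depend on the dictionary shipped with the installed version, so no output is asserted):

from pythainlp.tokenize import word_tokenize

text = "ฉันรักภาษาไทย"
print(word_tokenize(text))                     # newmm, the default engine
print(word_tokenize(text, engine="longest"))   # longest matching
print(word_tokenize(text, engine="icu"))       # requires PyICU to be installed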
Code Example #16
from pythainlp import Tokenizer
from pythainlp.corpus import thai_stopwords, thai_words
from wordcloud import WordCloud, STOPWORDS
from flask import Flask, request, send_file
import json
import gc
import os


def word_preparing(words):
    '''Deduplicate the list of words and lower-case each word'''
    words = set(words)  # drop duplicate word, unique word
    return [word.lower() for word in words]  # to lower case


# get dictionary and stopword corpus
DEFAULT_DICT = list(thai_words())
DEFAULT_STOPWORLS = list(thai_stopwords()) + list(STOPWORDS)

# word preparing
DEFAULT_DICT = word_preparing(DEFAULT_DICT)
DEFAULT_STOPWORLS = word_preparing(DEFAULT_STOPWORLS)

IMAGE_FILE = "wordcloud.png"  # file name used to save the wordcloud image

app = Flask(__name__)


@app.route("/wordcloud", methods=["POST"])
def gen_worldcloud():
    # get text, custom_stopwords and custom_dict
    body = json.loads(request.get_data())
Code Example #17
File: gen.py Project: PyThaiNLP/Thai-Data-Privacy
# -*- coding: utf-8 -*-
import random
from pythainlp.corpus import thai_female_names, thai_male_names, thai_words
from faker import Faker
fake = Faker()

list_name = list(thai_female_names()) + list(thai_male_names())
list_domain_thai = [
    ".go.th", ".co.th", ".or.th", ".in.th", ".ac.th", ".net.th", ".mi.th",
    ".ไทย"
]
list_thai_word = [i for i in list(thai_words()) if ' ' not in i]


def gen_name(full_name: bool = False) -> str:
    name = random.choice(list_name)
    if full_name:
        name += " " + random.choice(list_name)
    return name


def gen_thai_phone_number(mobile: bool = True) -> str:
    num = "0"
    last_i = 7
    if mobile:
        last_i = 8
    num += str(random.randint(1, 9))
    for i in range(0, last_i):
        num += str(random.randint(0, 9))
    return num
Code Example #18
# -*- coding: utf-8 -*-
from pythainlp.tokenize import word_tokenize, dict_trie
from pythainlp.corpus import thai_stopwords, thai_words, tnc
from pythainlp.util import normalize
import data
stopwords = list(thai_stopwords())
thaiword = list(thai_words())
#tnc1=[word for word,i in tnc.word_freqs()]
thaiword.remove("กินข้าว")
datadict = dict_trie(
    list(set(data.ccc + thaiword + stopwords + data.conjunctions)))  #+tnc1)))


def wordcut(word):
    global datadict
    return word_tokenize(word, custom_dict=datadict)
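As a hedged usage note: removing "กินข้าว" from the word list above is a deliberate choice so that the compound gets split into its parts; a quick check (indicative only, since data.ccc and data.conjunctions are not shown here):

print(wordcut("ฉันกินข้าว"))  # "กิน" and "ข้าว" are expected as separate tokens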
Code Example #19
File: __init__.py Project: veer66/pythainlp
# -*- coding: utf-8 -*-
"""
Thai tokenizers
"""
import re
import warnings
from typing import Iterable, List, Union

from pythainlp.corpus import thai_syllables, thai_words

from .trie import Trie

DEFAULT_DICT_TRIE = Trie(thai_words())


def word_tokenize(
    text: str,
    custom_dict: Trie = None,
    engine: str = "newmm",
    keep_whitespace: bool = True,
) -> List[str]:
    """
    This function tokenizes running text into words.

    :param str text: text to be tokenized
    :param str engine: name of the tokenizer to be used
    :param pythainlp.tokenize.Trie custom_dict: dictionary trie
    :param bool keep_whitespace: True to keep whitespaces, a common mark
                                 for end of phrase in Thai.
                                 Otherwise, whitespaces are omitted.
    :return: list of words
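A short sketch of the keep_whitespace behaviour described above (token output is indicative only):

from pythainlp.tokenize import word_tokenize

text = "ฉันรักทะเล และภูเขา"
print(word_tokenize(text))                         # whitespace kept as a token
print(word_tokenize(text, keep_whitespace=False))  # whitespace dropped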
Code Example #20
import codecs

from pythainlp import word_tokenize, Tokenizer
from pythainlp.corpus import thai_stopwords, thai_words
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from flask import Flask, request, send_file, after_this_request, render_template, redirect
import numpy as np
import random
import matplotlib

stop_words = list(thai_stopwords()) + list(STOPWORDS) +\
             ["฿","ly","pic","co","th","https","com","youtu","http","www","twitter","html","bit"]
stop_words = [word.lower() for word in stop_words]  # lower-case all stopwords

pythainlp_words = thai_words()
custom_dict = [
    'โคโรนา', 'ลุงตู่', 'โควิด', 'โคโรน่า', 'เจลล้างมือ', 'ขบวนเสด็จ'
]
dictionary = list(pythainlp_words) + list(custom_dict)

tok = Tokenizer(dictionary)


class main_flask():
    app = Flask(__name__)

    @app.route('/', methods=['GET'])
    def upload_file():
        return render_template('upload_text_redirect.html')