def test_corpus(self):
    self.assertIsInstance(thai_negations(), frozenset)
    self.assertIsInstance(thai_stopwords(), frozenset)
    self.assertIsInstance(thai_syllables(), frozenset)
    self.assertIsInstance(thai_words(), frozenset)
    self.assertIsInstance(countries(), frozenset)
    self.assertIsInstance(provinces(), frozenset)
    self.assertIsInstance(thai_female_names(), frozenset)
    self.assertIsInstance(thai_male_names(), frozenset)

    self.assertEqual(get_corpus_db_detail("XXX"), {})  # corpus does not exist
    self.assertTrue(download("test"))  # download the first time
    self.assertTrue(download(name="test", force=True))  # force download
    self.assertTrue(download(name="test"))  # try to download an existing corpus
    self.assertFalse(download(name="test", url="wrongurl"))  # URL does not exist
    self.assertFalse(download(name="XxxXXxxx817d37sf"))  # corpus name does not exist
    self.assertIsNotNone(get_corpus_db_detail("test"))  # corpus exists
    self.assertTrue(remove("test"))  # remove existing
    self.assertFalse(remove("test"))  # remove non-existing
    self.assertTrue(download(name="test", version="0.1"))
    self.assertTrue(remove("test"))
def test_trie(self):
    self.assertIsNotNone(Trie([]))
    self.assertIsNotNone(Trie(["ทดสอบ", "ทด", "ทอด", "ทอผ้า"]))
    self.assertIsNotNone(Trie({"ทอด", "ทอง", "ทาง"}))
    self.assertIsNotNone(Trie(("ทอด", "ทอง", "ทาง")))
    self.assertIsNotNone(Trie(Trie(["ทดสอบ", "ทดลอง"])))

    trie = Trie(["ทด", "ทดสอบ", "ทดลอง"])
    self.assertIn("ทด", trie)

    trie.add("ทบ")
    self.assertEqual(len(trie), 4)
    self.assertEqual(len(trie.prefixes("ทดสอบ")), 2)

    trie.remove("ทบ")
    trie.remove("ทด")
    self.assertEqual(len(trie), 2)

    trie = Trie([])
    self.assertEqual(len(trie), 0)
    trie.remove("หมด")
    self.assertEqual(len(trie), 0)

    self.assertIsNotNone(dict_trie(Trie(["ลอง", "ลาก"])))
    self.assertIsNotNone(dict_trie(("ลอง", "สร้าง", "Trie", "ลน")))
    self.assertIsNotNone(dict_trie(["ลอง", "สร้าง", "Trie", "ลน"]))
    self.assertIsNotNone(dict_trie({"ลอง", "สร้าง", "Trie", "ลน"}))
    self.assertIsNotNone(dict_trie(thai_words()))
    self.assertIsNotNone(
        dict_trie(os.path.join(_CORPUS_PATH, _THAI_WORDS_FILENAME))
    )

    with self.assertRaises(TypeError):
        dict_trie("")
    with self.assertRaises(TypeError):
        dict_trie(None)
    with self.assertRaises(TypeError):
        dict_trie(42)
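# A minimal usage sketch of the dict_trie / word_tokenize API exercised by the
# test above; the word list is an arbitrary example, and the import paths
# assume the pythainlp 2.x layout used elsewhere in these snippets.
from pythainlp.tokenize import dict_trie, word_tokenize

custom = dict_trie(["รถ", "ไฟ", "รถไฟ"])          # build a dictionary trie from a word list
print(word_tokenize("รถไฟ", custom_dict=custom))  # tokenize using only the custom dictionary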
def test_word_tokenize(self):
    self.assertEqual(word_tokenize(""), [])
    self.assertEqual(
        word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"),
        ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
    )

    self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="newmm"))
    self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="mm"))
    self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="longest"))
    self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="ulmfit"))
    self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="icu"))
    self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="deepcut"))
    self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="XX"))

    self.assertIsNotNone(dict_trie(()))
    self.assertIsNotNone(dict_trie(("ทดสอบ", "สร้าง", "Trie")))
    self.assertIsNotNone(dict_trie(["ทดสอบ", "สร้าง", "Trie"]))
    self.assertIsNotNone(dict_trie(thai_words()))
    self.assertIsNotNone(dict_trie(FROZEN_DICT_TRIE))

    self.assertIsNotNone(word_tokenize("รถไฟฟ้าBTS", custom_dict=DEFAULT_DICT_TRIE))
    self.assertIsNotNone(
        word_tokenize("ทดสอบ", engine="deepcut", custom_dict=FROZEN_DICT_TRIE)
    )
    self.assertIsNotNone(
        word_tokenize("ทดสอบ", engine="XX", custom_dict=FROZEN_DICT_TRIE)
    )
def test_corpus(self):
    self.assertIsInstance(thai_negations(), frozenset)
    self.assertIsInstance(thai_stopwords(), frozenset)
    self.assertIsInstance(thai_syllables(), frozenset)
    self.assertIsInstance(thai_words(), frozenset)
    self.assertIsInstance(countries(), frozenset)
    self.assertIsInstance(provinces(), frozenset)
    self.assertIsInstance(provinces(details=True), list)
    self.assertEqual(
        len(provinces(details=False)), len(provinces(details=True))
    )
    self.assertIsInstance(thai_family_names(), frozenset)
    self.assertIsInstance(list(thai_family_names())[0], str)
    self.assertIsInstance(thai_female_names(), frozenset)
    self.assertIsInstance(thai_male_names(), frozenset)

    self.assertIsInstance(
        get_corpus_db("https://example.com/XXXXXX0lkjasd/SXfmskdjKKXXX"),
        Response,
    )  # URL does not exist, should get a 404 response
    self.assertIsNone(get_corpus_db("XXXlkja3sfdXX"))  # invalid URL

    self.assertEqual(
        get_corpus_db_detail("XXXmx3KSXX"), {}
    )  # corpus does not exist
    self.assertEqual(
        get_corpus_db_detail("XXXmx3KSXX", version="0.2"), {}
    )  # corpus does not exist

    self.assertTrue(download("test"))  # download the first time
    self.assertTrue(download(name="test", force=True))  # force download
    self.assertTrue(download(name="test"))  # try to download an existing corpus
    self.assertFalse(download(name="test", url="wrongurl"))  # URL does not exist
    self.assertFalse(download(name="XxxXXxxx817d37sf"))  # corpus name does not exist
    self.assertIsNotNone(get_corpus_db_detail("test"))  # corpus exists
    self.assertIsNotNone(get_corpus_path("test"))  # corpus exists
    self.assertTrue(remove("test"))  # remove existing
    self.assertFalse(remove("test"))  # remove non-existing
    self.assertIsNone(get_corpus_path("XXXkdjfBzc"))  # query non-existing

    self.assertFalse(download(name="test", version="0.0"))
    self.assertFalse(download(name="test", version="0.0.0"))
    self.assertFalse(download(name="test", version="0.0.1"))
    self.assertFalse(download(name="test", version="0.0.2"))
    self.assertFalse(download(name="test", version="0.0.3"))
    self.assertFalse(download(name="test", version="0.0.4"))
    self.assertIsNotNone(download(name="test", version="0.0.5"))
    self.assertTrue(download("test"))
    self.assertIsNotNone(remove("test"))  # remove existing
    self.assertIsNotNone(download(name="test", version="0.0.6"))
    self.assertIsNotNone(download(name="test", version="0.0.7"))
    self.assertIsNotNone(download(name="test", version="0.0.8"))
    self.assertIsNotNone(download(name="test", version="0.0.9"))
    self.assertIsNotNone(download(name="test", version="0.0.10"))

    with self.assertRaises(Exception) as context:
        self.assertIsNotNone(download(name="test", version="0.0.11"))
    self.assertTrue(
        "Hash does not match expected." in str(context.exception)
    )

    self.assertIsNotNone(download(name="test", version="0.1"))
    self.assertIsNotNone(remove("test"))
def test_corpus(self):
    self.assertIsNotNone(countries())
    self.assertIsNotNone(provinces())
    self.assertIsNotNone(thai_negations())
    self.assertIsNotNone(thai_stopwords())
    self.assertIsNotNone(thai_syllables())
    self.assertIsNotNone(thai_words())

    download("test")
    self.assertIsNotNone(remove("test"))
    self.assertIsNotNone(remove("tnc_freq"))
def test():
    body = json.loads(request.get_data())
    text = body['text']
    try:
        custom_stopwords = body['custom_stopwords']
    except KeyError:
        custom_stopwords = [""]
    try:
        custom_dict = body['custom_dict']
    except KeyError:
        custom_dict = [""]
    # Receive the input from the user.

    stop_words = list(thai_stopwords()) + list(STOPWORDS) + custom_stopwords
    stop_words = [word.lower() for word in stop_words]  # normalize to lower case
    # These are the words that are not allowed to show up in the wordcloud.

    pythainlp_words = thai_words()
    dictionary = list(pythainlp_words) + custom_dict
    # Add words that are missing from the Thai/English dictionary.
    # For example, the input "ลุงตู่" would come out as "ลุง", "ตู่",
    # but with the extra dictionary entry the output is "ลุงตู่".

    tok = Tokenizer(dictionary)  # set up the tokenizer

    text = tok.word_tokenize(text)
    text = ' '.join(text)
    text = text.lower()
    # Tokenize the text.

    wordcloud = WordCloud(
        stopwords=stop_words,
        font_path='THSarabunNew.ttf',
        min_word_length=2,
        relative_scaling=1.0,
        min_font_size=1,
        background_color="black",
        width=800,
        height=600,
        scale=10,
        font_step=1,
        collocations=False,
        colormap="gist_ncar",
        regexp=r"[\u0E00-\u0E7Fa-zA-Z']+",
        margin=2,
    ).generate(text)
    # Generate the wordcloud.

    plt.figure(figsize=(16, 9))
    plt.imshow(wordcloud, cmap=plt.cm.gray, interpolation='bilinear')
    plt.axis("off")
    # Lay out the wordcloud.

    wordcloud.to_file('wordcloud.png')
    gc.collect()
    # Save the image on the server and free memory.

    return send_file('wordcloud.png')
def test_word_tokenize(self):
    self.assertEqual(word_tokenize(""), [])
    self.assertEqual(
        word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"),
        ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
    )

    self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="newmm"))
    self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="mm"))
    self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="longest"))
    self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="icu"))
    self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="deepcut"))
    self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="attacut"))
    self.assertIsNotNone(
        word_tokenize("หมอนทองตากลมหูว์MBK39", engine="XX")
    )  # "XX" engine does not exist

    self.assertIsNotNone(dict_trie(()))
    self.assertIsNotNone(dict_trie(("ทดสอบ", "สร้าง", "Trie")))
    self.assertIsNotNone(dict_trie(["ทดสอบ", "สร้าง", "Trie"]))
    self.assertIsNotNone(dict_trie({"ทดสอบ", "สร้าง", "Trie"}))
    self.assertIsNotNone(dict_trie(thai_words()))
    self.assertIsNotNone(dict_trie(DEFAULT_DICT_TRIE))
    self.assertIsNotNone(
        dict_trie(os.path.join(_CORPUS_PATH, _THAI_WORDS_FILENAME))
    )

    self.assertTrue(
        "ไฟ" in word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"]))
    )

    # Commented out until this unittest bug gets fixed:
    # https://bugs.python.org/issue29620
    # with self.assertWarns(DeprecationWarning):
    #     dict_word_tokenize("เลิกใช้แล้ว", custom_dict=DEFAULT_DICT_TRIE)
    self.assertEqual(
        word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"])),
        dict_word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"])),
    )
def test_corpus(self):
    self.assertIsNotNone(countries())
    self.assertIsNotNone(provinces())
    self.assertIsNotNone(thai_negations())
    self.assertIsNotNone(thai_stopwords())
    self.assertIsNotNone(thai_syllables())
    self.assertIsNotNone(thai_words())
    self.assertIsNotNone(thai_female_names())
    self.assertIsNotNone(thai_male_names())

    self.assertEqual(get_corpus_db_detail("XXX"), {})
    self.assertIsNone(download("test"))
    self.assertIsNone(download("test", force=True))
    self.assertIsNotNone(get_corpus_db_detail("test"))
    self.assertIsNotNone(remove("test"))
    self.assertFalse(remove("test"))
def __init__(
    self,
    custom_dict: Union[str, Iterable] = None,
    tokenize_engine: str = "newmm",
):
    """
    Initialize the tokenizer object.

    :param str custom_dict: a file path or a list of vocabularies
                            to be used to create a trie
                            (default - original LEXiTRON)
    :param str tokenize_engine: tokenization engine to use
                                (newmm, mm, longest)
    """
    self.__trie_dict = None
    self.word_engine = tokenize_engine
    if custom_dict:
        self.__trie_dict = dict_trie(custom_dict)
    else:
        self.__trie_dict = dict_trie(thai_words())
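# A hedged usage sketch for the Tokenizer constructor above; "ลุงตู่" is an
# arbitrary example word that is not in the default dictionary, and the
# word_tokenize method is the one used by the other snippets in this file.
from pythainlp import Tokenizer
from pythainlp.corpus import thai_words

words = list(thai_words()) + ["ลุงตู่"]
tok = Tokenizer(custom_dict=words, tokenize_engine="newmm")
print(tok.word_tokenize("ลุงตู่กินข้าว"))  # "ลุงตู่" stays a single token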
def revise_newmm_default_wordset(
    training_data: Iterable[Iterable[str]],
) -> Set[str]:
    """
    Revise a set of words to improve the tokenization performance of
    `pythainlp.tokenize.newmm`, a dictionary-based tokenizer and the
    default tokenizer for PyThaiNLP.

    Words from `pythainlp.corpus.thai_words()` are used as the base set
    for the dictionary. Words that do not perform well on `training_data`
    are removed, and the remaining words are returned.

    :param Iterable[Iterable[str]] training_data: tokenized text, to be used\
        as a training set
    :return: the revised set of words for the dictionary
    :rtype: Set[str]
    """
    orig_words = thai_words()
    trie = Trie(orig_words)

    def tokenize(text):
        return newmm.segment(text, custom_dict=trie)

    revised_words = revise_wordset(tokenize, orig_words, training_data)

    return revised_words
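# A hedged usage sketch for revise_newmm_default_wordset; the training data
# here is a toy example (each inner list is one hand-tokenized sentence),
# not a realistic training set.
training = [
    ["ฉัน", "รัก", "ภาษาไทย"],
    ["ฉัน", "เป็น", "คนไทย"],
]
revised = revise_newmm_default_wordset(training)
print(len(revised))  # number of words kept in the revised dictionary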
_valid_tokens = (
    set(_digits.keys()) | set(_powers_of_10.keys()) | {"ล้าน", "ลบ"}
)
_tokenizer = Tokenizer(custom_dict=_valid_tokens)


def _check_is_thainum(word: str):
    for j in list(_digits.keys()):
        if j in word:
            return (True, 'num')
    for j in ["สิบ", "ร้อย", "พัน", "หมื่น", "แสน", "ล้าน", "จุด", "ลบ"]:
        if j in word:
            return (True, 'unit')
    return (False, None)


_dict_words = [i for i in list(thai_words()) if not _check_is_thainum(i)[0]]
_dict_words += list(_digits.keys())
_dict_words += ["สิบ", "ร้อย", "พัน", "หมื่น", "แสน", "ล้าน", "จุด"]
_tokenizer_thaiwords = Tokenizer(_dict_words)


def thaiword_to_num(word: str) -> int:
    """
    Converts spelled-out numerals in Thai script into an actual integer.

    :param str word: spelled-out numerals in Thai script
    :return: corresponding integer value of the input
    :rtype: int

    :Example:
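# Hedged usage sketch: the body of thaiword_to_num is truncated above, so this
# assumes the elided implementation matches its docstring; "ห้าสิบ" spells
# fifty in Thai.
# print(thaiword_to_num("ห้าสิบ"))  # expected: 50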
SPACE_TOKEN = "<_>"
DEPRECATED_SPACE_TOKEN = '<th_roberta_space_token>'
SEFR_SPLIT_TOKEN = '<|>'

ADDITIONAL_SPECIAL_TOKENS = [
    '<s>', '<pad>', '</s>', '<unk>', '<mask>', SPACE_TOKEN, '\n'
]
ADDITIONAL_SPECIAL_TOKENS_EXCLUDE_SPACE_TOKEN = \
    [e for e in ADDITIONAL_SPECIAL_TOKENS if e != SPACE_TOKEN]
SET_ADDITIONAL_SPECIAL_TOKENS = frozenset(ADDITIONAL_SPECIAL_TOKENS)

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "th-roberta-base": 514,
}

# Store pre-tokenizer functions (text cutters)
PRE_TOKENIZERS_MAP = {
    'newmm': partial(
        word_tokenize,
        custom_dict=Trie(
            frozenset(set(thai_words()).union(set(ADDITIONAL_SPECIAL_TOKENS)))
        ),
    ),
    'syllable': partial(
        word_tokenize,
        custom_dict=Trie(
            frozenset(set(thai_syllables()).union(set(ADDITIONAL_SPECIAL_TOKENS)))
        ),
    ),
}

_nb_cores = multiprocessing.cpu_count()


def split_additional_special_token(texts):
    """
    Split a list of texts by the additional special tokens,
    excluding the space token.

    Args:
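# Hedged usage sketch: selecting a pre-tokenizer (text cutter) from
# PRE_TOKENIZERS_MAP above; assumes the pythainlp word corpus is available
# locally, and the input text is an arbitrary example.
pre_tokenize = PRE_TOKENIZERS_MAP['newmm']
print(pre_tokenize("ฉันรักภาษาไทย<_>มาก"))  # SPACE_TOKEN "<_>" is in the custom dict, so it stays one token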
# -*- coding: utf-8 -*-
"""
Thai tokenizers
"""
import re
import sys
from typing import Iterable, List, Union

from pythainlp.corpus import get_corpus, thai_syllables, thai_words
from marisa_trie import Trie

DEFAULT_DICT_TRIE = Trie(thai_words())
FROZEN_DICT_TRIE = Trie(get_corpus("words_th_frozen_201810.txt"))


def word_tokenize(
    text: str,
    custom_dict: Trie = None,
    engine: str = "newmm",
    keep_whitespace: bool = True,
) -> List[str]:
    """
    :param str text: text to be tokenized
    :param str engine: tokenizer to be used
    :param dict custom_dict: a dictionary trie
    :param bool keep_whitespace: True to keep whitespaces, a common mark
                                 for end of phrase in Thai

    :Parameters for engine:
        * newmm (default) - dictionary-based,
          Maximum Matching + Thai Character Cluster
        * longest - dictionary-based, Longest Matching
        * deepcut - wrapper for deepcut, language-model-based
          https://github.com/rkcosmos/deepcut
        * icu - wrapper for ICU (International Components for Unicode,
          using PyICU), dictionary-based
        * ulmfit - for thai2fit
        * a custom_dict can be provided for newmm, longest, and deepcut
from pythainlp import Tokenizer
from pythainlp.corpus import thai_stopwords, thai_words
from wordcloud import WordCloud, STOPWORDS
from flask import Flask, request, send_file
import json
import gc
import os


def word_preparing(words):
    """Make the list of words unique and lower-case each word."""
    words = set(words)  # drop duplicates, keep unique words
    return [word.lower() for word in words]  # to lower case


# Get dictionary and stopword corpora.
DEFAULT_DICT = list(thai_words())
DEFAULT_STOPWORDS = list(thai_stopwords()) + list(STOPWORDS)

# Word preparing.
DEFAULT_DICT = word_preparing(DEFAULT_DICT)
DEFAULT_STOPWORDS = word_preparing(DEFAULT_STOPWORDS)

IMAGE_FILE = "wordcloud.png"  # filename for saving the wordcloud image

app = Flask(__name__)


@app.route("/wordcloud", methods=["POST"])
def gen_wordcloud():
    # Get text, custom_stopwords, and custom_dict.
    body = json.loads(request.get_data())
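# Hedged client sketch for the /wordcloud endpoint above; assumes the app is
# served locally on port 5000 and that the handler reads the JSON keys named
# in its comment (text, custom_stopwords, custom_dict).
import requests

resp = requests.post(
    "http://localhost:5000/wordcloud",
    json={"text": "ฉันรักภาษาไทย ฉันเป็นคนไทย",
          "custom_stopwords": [], "custom_dict": []},
)
with open("wordcloud_client.png", "wb") as f:
    f.write(resp.content)  # the endpoint returns the generated PNG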
# -*- coding: utf-8 -*-
import random

from pythainlp.corpus import thai_female_names, thai_male_names, thai_words
from faker import Faker

fake = Faker()

list_name = list(thai_female_names()) + list(thai_male_names())
list_domain_thai = [
    ".go.th", ".co.th", ".or.th", ".in.th",
    ".ac.th", ".net.th", ".mi.th", ".ไทย",
]
list_thai_word = [i for i in list(thai_words()) if ' ' not in i]


def gen_name(full_name: bool = False) -> str:
    name = random.choice(list_name)
    if full_name:
        name += " " + random.choice(list_name)
    return name


def gen_thai_phone_number(mobile: bool = True) -> str:
    num = "0"
    last_i = 7
    if mobile:
        last_i = 8
    num += str(random.randint(1, 9))
    for i in range(0, last_i):
        num += str(random.randint(0, 9))
    return num
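# Usage sketch for the generators above:
print(gen_name(full_name=True))            # random "first last" pair from the name corpora
print(gen_thai_phone_number(mobile=True))  # random 10-digit mobile-style number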
# -*- coding: utf-8 -*-
from pythainlp.tokenize import word_tokenize, dict_trie
from pythainlp.corpus import thai_stopwords, thai_words, tnc
from pythainlp.util import normalize

import data

stopwords = list(thai_stopwords())
thaiword = list(thai_words())
# tnc1 = [word for word, i in tnc.word_freqs()]
thaiword.remove("กินข้าว")
datadict = dict_trie(
    list(set(data.ccc + thaiword + stopwords + data.conjunctions))
)  # + tnc1


def wordcut(word):
    return word_tokenize(word, custom_dict=datadict)
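# Usage sketch (assumes the local `data` module above provides data.ccc and
# data.conjunctions as word lists):
print(wordcut("ฉันรักภาษาไทย"))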
# -*- coding: utf-8 -*-
"""
Thai tokenizers
"""
import re
import warnings
from typing import Iterable, List, Union

from pythainlp.corpus import thai_syllables, thai_words

from .trie import Trie

DEFAULT_DICT_TRIE = Trie(thai_words())


def word_tokenize(
    text: str,
    custom_dict: Trie = None,
    engine: str = "newmm",
    keep_whitespace: bool = True,
) -> List[str]:
    """
    This function tokenizes running text into words.

    :param str text: text to be tokenized
    :param str engine: name of the tokenizer to be used
    :param pythainlp.tokenize.Trie custom_dict: dictionary trie
    :param bool keep_whitespace: True to keep whitespaces, a common mark
                                 for end of phrase in Thai.
                                 Otherwise, whitespaces are omitted.
    :return: list of words
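# Hedged usage sketch of the word_tokenize API documented above, using only
# parameters from its signature:
from pythainlp.tokenize import word_tokenize

print(word_tokenize("ฉันรักภาษาไทย", engine="newmm"))
print(word_tokenize("ฉันรักภาษาไทย", engine="longest", keep_whitespace=False))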
import codecs

from pythainlp import word_tokenize, Tokenizer
from pythainlp.corpus import thai_stopwords, thai_words
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from flask import (
    Flask, request, send_file, after_this_request, render_template, redirect
)
import numpy as np
import random
import matplotlib

stop_words = list(thai_stopwords()) + list(STOPWORDS) + [
    "฿", "ly", "pic", "co", "th", "https", "com",
    "youtu", "http", "www", "twitter", "html", "bit",
]
stop_words = [word.lower() for word in stop_words]  # normalize to lower case

pythainlp_words = thai_words()
custom_dict = [
    'โคโรนา', 'ลุงตู่', 'โควิด', 'โคโรน่า', 'เจลล้างมือ', 'ขบวนเสด็จ'
]
dictionary = list(pythainlp_words) + list(custom_dict)
tok = Tokenizer(dictionary)


class main_flask():
    app = Flask(__name__)

    @app.route('/', methods=['GET'])
    def upload_file():
        return render_template('upload_text_redirect.html')