import gc
import json

import matplotlib.pyplot as plt
from flask import request, send_file
from pythainlp import Tokenizer
from pythainlp.corpus import thai_stopwords, thai_words
from wordcloud import STOPWORDS, WordCloud


def test():
    # Read the user's input.
    body = json.loads(request.get_data())
    text = body['text']
    try:
        custom_stopwords = body['custom_stopwords']
    except KeyError:
        custom_stopwords = [""]
    try:
        custom_dict = body['custom_dict']
    except KeyError:
        custom_dict = [""]

    # These are the words that must not show up in the wordcloud.
    stop_words = list(thai_stopwords()) + list(STOPWORDS) + custom_stopwords
    # (The original called a bare map() here, which is lazy and discarded
    #  its result; the lower-casing is actually applied now.)
    stop_words = [w.lower() for w in stop_words]

    # Add words that are missing from the Thai/English dictionary so they are
    # kept whole: e.g. the input "ลุงตู่" tokenizes as "ลุง", "ตู่", but after
    # adding it to the dictionary the output is "ลุงตู่".
    pythainlp_words = thai_words()
    dictionary = list(pythainlp_words) + custom_dict

    # Set up the tokenizer and tokenize the text.
    tok = Tokenizer(dictionary)
    text = ' '.join(tok.word_tokenize(text)).lower()

    # Generate the wordcloud.
    wordcloud = WordCloud(stopwords=stop_words,
                          font_path='THSarabunNew.ttf',
                          min_word_length=2,
                          relative_scaling=1.0,
                          min_font_size=1,
                          background_color="black",
                          width=800,
                          height=600,
                          scale=10,
                          font_step=1,
                          collocations=False,
                          colormap="gist_ncar",
                          regexp=r"[\u0E00-\u0E7Fa-zA-Z']+",
                          margin=2).generate(text)

    # Lay out the wordcloud.
    plt.figure(figsize=(16, 9))
    plt.imshow(wordcloud, cmap=plt.cm.gray, interpolation='bilinear')
    plt.axis("off")

    # Save the image on the server and free memory.
    wordcloud.to_file('wordcloud.png')
    gc.collect()
    return send_file('wordcloud.png')
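# For reference, a client call for the endpoint above might look like this
# sketch. Only the JSON keys come from the snippet; the host, port and route
# are hypothetical, since the @app.route decorator is not shown.
import requests

resp = requests.post(
    "http://localhost:5000/test",  # assumed URL
    json={
        "text": "ลุงตู่ประกาศมาตรการใหม่วันนี้",  # raw text to visualize
        "custom_stopwords": ["วันนี้"],           # extra words to hide
        "custom_dict": ["ลุงตู่"],                # extra dictionary entries
    },
)
with open("wordcloud.png", "wb") as f:
    f.write(resp.content)  # the endpoint responds with the rendered PNG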
def gen_worldcloud():
    # Get text, custom_stopwords and custom_dict.
    body = json.loads(request.get_data())
    text = body['text']
    custom_stopwords = body['custom_stopwords'] if 'custom_stopwords' in body else []
    custom_dict = body['custom_dict'] if 'custom_dict' in body else []

    # Add words that are missing from the Thai/English dictionary so they are
    # kept whole: e.g. the input "ลุงตู่" tokenizes as "ลุง", "ตู่", but after
    # adding it to the dictionary the output is "ลุงตู่".
    stop_words = DEFAULT_STOPWORLS + custom_stopwords
    dictionary = DEFAULT_DICT + custom_dict

    # Word preparing.
    stop_words = word_preparing(stop_words)
    dictionary = word_preparing(dictionary)

    # Tokenize / segment the text into words.
    tok = Tokenizer(dictionary)
    tokens = tok.word_tokenize(text)
    text = ' '.join(tokens)  # convert tokens back to a string
    text = text.lower()      # lower-case

    # Generate the wordcloud.
    wordcloud = WordCloud(stopwords=stop_words,
                          font_path='THSarabunNew.ttf',
                          min_word_length=2,
                          relative_scaling=1.0,
                          min_font_size=1,
                          background_color="black",
                          width=800,
                          height=600,
                          scale=10,
                          font_step=1,
                          collocations=False,
                          colormap="gist_ncar",
                          regexp=r"[\u0E00-\u0E7Fa-zA-Z']+",
                          margin=2).generate(text)

    # Save the image and free memory.
    wordcloud.to_file(IMAGE_FILE)
    gc.collect()

    # Respond with the image.
    return send_file(IMAGE_FILE)
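# The names referenced above (DEFAULT_STOPWORLS, DEFAULT_DICT, IMAGE_FILE,
# word_preparing) are defined elsewhere in the source. A minimal sketch,
# assuming they mirror the behaviour of the first version of this endpoint
# (the exact definitions are not shown):
from pythainlp.corpus import thai_stopwords, thai_words
from wordcloud import STOPWORDS

IMAGE_FILE = 'wordcloud.png'
DEFAULT_STOPWORLS = list(thai_stopwords()) + list(STOPWORDS)  # spelling kept from the source
DEFAULT_DICT = list(thai_words())

def word_preparing(words):
    # Lower-case every entry and drop empty strings.
    return [w.lower() for w in words if w]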
def ProcessText(self, text):
    # Build a custom dictionary from the database's street, address and
    # district names, plus the default Thai words and this object's
    # special-word list.
    dataBase = database()
    streetDf, addressDf = dataBase.ReadStreetName()
    streetList = dataBase.DataframeToList(streetDf)
    addressList = dataBase.DataframeToList(addressDf)
    districtList = dataBase.districtName_
    wordList = districtList + streetList + addressList

    custom_words_list = set(thai_words())
    custom_words_list.update(wordList)
    custom_words_list.update(self.specWord)

    trie = dict_trie(dict_source=custom_words_list)
    custom_tokenizer = Tokenizer(custom_dict=trie, engine=self.engineSel)
    proc = custom_tokenizer.word_tokenize(text)

    # Strip punctuation and digits from each token. (The original built the
    # cleaned list but then filtered the raw tokens in `proc`, discarding the
    # cleaning; the cleaned list is used here.)
    strip_table = str.maketrans('', '', string.punctuation + '1234567890')
    cleanList = [i.translate(strip_table) for i in proc]

    # Drop whitespace-only and empty tokens.
    procText = [x for x in cleanList if x not in (" ", "")]
    # procText = [x for x in procText if len(x) > 2]

    joinText = ' '.join(procText)
    # print(joinText)
    return joinText
from pythainlp import word_tokenize, Tokenizer

text = "กฎหมายแรงงานฉบับปรับปรุงใหม่ประกาศใช้แล้ว"

'''
Options for engine
    newmm (default) - dictionary-based, Maximum Matching + Thai Character Cluster
    longest - dictionary-based, Longest Matching
    deepcut - wrapper for deepcut, language-model-based
    icu - wrapper for ICU (International Components for Unicode, using PyICU), dictionary-based
    ulmfit - for thai2fit
see more: https://thainlp.org/pythainlp/docs/2.0/api/tokenize.html
'''

print("newmm:", word_tokenize(text))  # default engine is "newmm"
print("longest:", word_tokenize(text, engine="longest"))

words = ["กฎ", "งาน"]
custom_tokenizer = Tokenizer(words)
print("custom:", custom_tokenizer.word_tokenize(text))
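# Passing a plain word list to Tokenizer, as above, replaces the default
# dictionary entirely. To extend the default dictionary instead, merge custom
# words into thai_words() and build a trie — the pattern several snippets
# below use. A minimal sketch; the example words are illustrative:
from pythainlp import Tokenizer
from pythainlp.corpus import thai_words
from pythainlp.util import dict_trie

custom_words = set(thai_words())         # start from the default dictionary
custom_words.update(["ลุงตู่", "โควิด"])   # add out-of-vocabulary words
trie = dict_trie(dict_source=custom_words)
extended_tokenizer = Tokenizer(custom_dict=trie, engine="newmm")
print("extended:", extended_tokenizer.word_tokenize("ลุงตู่แถลงเรื่องโควิด"))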
from flask import Flask, request, send_file, after_this_request, render_template, redirect
import numpy as np
import random
import matplotlib
from pythainlp import Tokenizer
from pythainlp.corpus import thai_stopwords, thai_words
from wordcloud import STOPWORDS

stop_words = list(thai_stopwords()) + list(STOPWORDS) + \
    ["฿", "ly", "pic", "co", "th", "https", "com", "youtu",
     "http", "www", "twitter", "html", "bit"]
# (The original called a bare map() here, which is lazy and discarded its
#  result; the lower-casing is actually applied now.)
stop_words = [w.lower() for w in stop_words]

pythainlp_words = thai_words()
custom_dict = [
    'โคโรนา', 'ลุงตู่', 'โควิด', 'โคโรน่า', 'เจลล้างมือ', 'ขบวนเสด็จ'
]
dictionary = list(pythainlp_words) + list(custom_dict)
tok = Tokenizer(dictionary)


class main_flask():
    app = Flask(__name__)

    @app.route('/', methods=['GET'])
    def upload_file():
        return render_template('upload_text_redirect.html')

    @app.route("/test", methods=['POST'])
    def test():
        texts = request.form['texts']
        texts = tok.word_tokenize(texts)
        text2 = ' '.join(texts)
        text2 = text2.lower()
        # (the rest of the handler is truncated in the original snippet)
"ฟอกฟัน", "ฟอกสีฟัน", "ขูดหินปูน", "แอร์โฟลว์", "แอร์โฟล", "แอร์โฟ", "แอร์โฟร์", "โรคเหงือก", "เลเซอร์เหงือก", "เลเซอร์", "เลเซอเหงือก", "เลเสอเหงือก", "เลเสอร์เเหงือก", "เลเซอร์เหงือกชมพู", "เลเซอเหงือกชมพู", "เลเสอเหงือกชมพูู", "เลเสอร์เเหงือกชมพู", "ศัลยกรรมเหงือก", "ตกแต่งเหงือก", "ศัลย์เหงือก", "ผ่าเหงือก", "ตัดเหงือก", "ตกแต่ง", "ศัลยกรรม", "รักษารากฟัน", "อุดฟัน", "ถอนฟัน", "ผ่าฟันคุด", "ฟันคุด", "ถอนฟันคุด" ] custom_words_list = set(thai_words()) ## add multiple words custom_words_list.update(words) ## add word trie = dict_trie(dict_source=custom_words_list) custom_tokenizer = Tokenizer(custom_dict=trie, engine='newmm') # loop through each sentence in our intents patterns for intent in intents['intents']: tag = intent['tag'] # add to tag list tags.append(tag) for pattern in intent['patterns']: # tokenize each word in the sentence # w = tokenize(pattern) w = custom_tokenizer.word_tokenize(pattern) # add to our words list all_words.extend(w)
# Assumed imports for this snippet (inferred from usage):
from numpy import array
from pandas import read_csv
from pythainlp import Tokenizer
from pythainlp.corpus import thai_words


class Keyword():  # (class header truncated in the original; name inferred from its use below)
    def __init__(self, word, weight):
        self.word = word      # Type: String
        self.weight = weight  # Type: Integer


class TreeNode():
    def __init__(self, data):
        self.data = data        # Type: <Dynamic>
        self.children = list()  # Type: List<Dynamic>


# Extend the default Thai dictionary with custom words from a file.
THAI_WORDS = set(thai_words())
for i in open('requirement/data/custom_tokenizer.txt', encoding="utf-8"):
    THAI_WORDS.add(i.replace('\n', '').strip())
TOKENIZER = Tokenizer(THAI_WORDS)

# Each CSV is read positionally: column 0 is the word, column 1 its weight.
KEYWORDS_HIGH_PRIORITY = [
    Keyword(i[0], int(i[1])) for i in array(
        read_csv('requirement/data/keywords/priority-high.csv')).tolist()
]
KEYWORDS_MEDIUM_PRIORITY = [
    Keyword(i[0], int(i[1])) for i in array(
        read_csv('requirement/data/keywords/priority-medium.csv')).tolist()
]
KEYWORDS_LOW_PRIORITY = [
    Keyword(i[0], int(i[1])) for i in array(
        read_csv('requirement/data/keywords/priority-low.csv')).tolist()
]
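# Hypothetical usage sketch (this score() helper is not in the source): sum
# the weights of known keywords that appear in a tokenized sentence.
def score(text, keywords):
    tokens = set(TOKENIZER.word_tokenize(text))
    return sum(k.weight for k in keywords if k.word in tokens)

# e.g. score(some_text, KEYWORDS_HIGH_PRIORITY)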
# Assumed imports for this snippet (aliases inferred from usage):
from nltk.corpus import stopwords as engsw
from nltk.corpus import wordnet
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize as eng_tokens
from pythainlp import Tokenizer
from pythainlp.corpus import stopwords as thaisw
from pythainlp.corpus import thai_words


def Processing(E1):
    p_stemmer = PorterStemmer()

    # Thai and English stopwords, plus project-specific extras.
    ThaiWord = list(thaisw.words('thai'))
    EngWord = list(set(engsw.words('english')))
    Morewords = [
        u'การ', u'การทำงาน', u'ทำงาน', u'เสมอ', u'krub', u'Test', u'nan',
        u' ', u'test', u'.', u',', u'ทำ', u'-', u'/'
    ]
    All_Stop_Word = ThaiWord + EngWord + Morewords

    # The first element of each row is the text entry.
    EntryList = [n[0] for n in E1]

    # Build the Thai tokenizer once, with a few extra dictionary words.
    # (The original rebuilt it for every entry inside the loop.)
    words = set(thai_words())
    words.update([u'ไทยเบฟ', u'ผสานพลัง', u'โอกาส', u'ถังไม้โอ๊ค'])
    custom_tokenizer = Tokenizer(words)

    # Tokenize: English word boundaries first, then Thai dictionary words.
    Outcome = []
    for r in EntryList:
        lowered = " ".join(t.lower() for t in eng_tokens(r))
        Outcome.append(list(custom_tokenizer.word_tokenize(lowered)))

    # Remove stopwords.
    NoStop = [[word for word in n if word not in All_Stop_Word]
              for n in Outcome]
    print(' No stop : ', NoStop, ' len: ', len(NoStop))

    # Stem the tokens (affects English tokens only).
    Lemma = [[p_stemmer.stem(word) for word in n] for n in NoStop]
    print(' Lemma : ', Lemma, ' len: ', len(Lemma))

    '''
    # Alternative kept from the original: lemmatize with WordNet instead
    # of stemming.
    wordnet_lemmatizer = WordNetLemmatizer()
    Lemma = [[wordnet_lemmatizer.lemmatize(t) for t in n] for n in NoStop]
    '''

    # Map each token to its Thai WordNet lemma when one exists.
    Lemma_temp = []
    for n in Lemma:
        Dummy = []
        for i in n:
            w_syn = wordnet.synsets(i)
            if len(w_syn) > 0 and len(w_syn[0].lemma_names('tha')) > 0:
                Dummy.append(w_syn[0].lemma_names('tha')[0])
            else:
                Dummy.append(i)
        Lemma_temp.append(Dummy)
    Lemma = Lemma_temp

    # Drop numeric tokens and tokens containing spaces.
    Lemma = [[i for i in n if not i.isnumeric()] for n in Lemma]
    Lemma = [[i for i in n if ' ' not in i] for n in Lemma]

    return Lemma
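# Illustrative call (hypothetical data; the expected input shape — an iterable
# of rows whose first element is the text — is implied by `n[0]` above):
rows = [(u'ไทยเบฟผสานพลังสร้างโอกาสในการทำงาน',)]
lemmas = Processing(rows)
print(lemmas)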
# (Start of this snippet is truncated in the original: `with open(...)` blocks
#  that initialize and fill positive_vocab precede this line.)
        positive_vocab.append(line.rstrip())

with open("swear-words.txt", 'r', encoding="utf8") as f:
    for line in f:
        swear_words.append(line.rstrip())

with open("neu.txt", 'r', encoding="utf8") as f:
    for line in f:
        neutral_vocab.append(line.rstrip())

sentences = input("insert thai text ")

# Keep "ลุงตู่" whole by adding it to the default dictionary before tokenizing.
pythainlp_words = thai_words()
custom_dict = ["ลุงตู่"]
dictionary = list(pythainlp_words) + custom_dict
tok = Tokenizer(dictionary)

tokens = tok.word_tokenize(sentences)
tokens = [' '.join(tokens)]

for sentence in tokens:
    pos = 0
    neu = 0
    neg = 0
    pred = []
    print(sentence)
    words = sentence.split(' ')
    for word in words:
        if word in positive_vocab:
            pos += 1  # (continuation inferred from the counters above;
                      #  the original snippet is truncated here)