def test_corpus(self):
    self.assertIsInstance(thai_negations(), frozenset)
    self.assertIsInstance(thai_stopwords(), frozenset)
    self.assertIsInstance(thai_syllables(), frozenset)
    self.assertIsInstance(thai_words(), frozenset)
    self.assertIsInstance(countries(), frozenset)
    self.assertIsInstance(provinces(), frozenset)
    self.assertIsInstance(thai_female_names(), frozenset)
    self.assertIsInstance(thai_male_names(), frozenset)
    self.assertEqual(get_corpus_db_detail("XXX"), {})  # corpus does not exist
    self.assertTrue(download("test"))  # download the first time
    self.assertTrue(download(name="test", force=True))  # force download
    self.assertTrue(download(name="test"))  # try download existing
    self.assertFalse(download(name="test", url="wrongurl"))  # URL not exist
    self.assertFalse(download(name="XxxXXxxx817d37sf"))  # corpus name not exist
    self.assertIsNotNone(get_corpus_db_detail("test"))  # corpus exists
    self.assertTrue(remove("test"))  # remove existing
    self.assertFalse(remove("test"))  # remove non-existing
    self.assertTrue(download(name="test", version="0.1"))
    self.assertTrue(remove("test"))
def text_to_bow_stopword(tokenized_text, vocabulary_):
    """Convert a list of tokenized texts into a sparse bag-of-words matrix."""
    n_doc = len(tokenized_text)
    values, row_indices, col_indices = [], [], []
    stop_words = set(thai_stopwords())
    for r, tokens in enumerate(tokenized_text):
        # drop stopwords before counting
        filtered_sentence = [w for w in tokens if w not in stop_words]
        feature = {}
        for token in filtered_sentence:
            word_index = vocabulary_.get(token)
            if word_index is not None:
                if word_index not in feature:
                    feature[word_index] = 1
                else:
                    feature[word_index] += 1
        for c, v in feature.items():
            values.append(v)
            row_indices.append(r)
            col_indices.append(c)
    # document-term matrix in sparse CSR format
    X = sp.csr_matrix((values, (row_indices, col_indices)),
                      shape=(n_doc, len(vocabulary_)))
    return X
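# --- Illustrative usage (not part of the original file) ---
# A minimal sketch of how text_to_bow_stopword() could be called, assuming
# scipy.sparse is imported as `sp` (as the function above requires) and that
# the vocabulary is a word-to-column-index dict. The sample documents and
# vocabulary below are made up for demonstration.
docs = [["ฉัน", "ชอบ", "กิน", "ข้าว"], ["ข้าว", "อร่อย", "มาก"]]
vocab = {"กิน": 0, "ข้าว": 1, "อร่อย": 2}
bow = text_to_bow_stopword(docs, vocab)
print(bow.toarray())  # shape (2, 3): rows are documents, columns are vocab indices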
def test_corpus(self):
    self.assertIsInstance(thai_negations(), frozenset)
    self.assertIsInstance(thai_stopwords(), frozenset)
    self.assertIsInstance(thai_syllables(), frozenset)
    self.assertIsInstance(thai_words(), frozenset)
    self.assertIsInstance(countries(), frozenset)
    self.assertIsInstance(provinces(), frozenset)
    self.assertIsInstance(provinces(details=True), list)
    self.assertEqual(
        len(provinces(details=False)), len(provinces(details=True))
    )
    self.assertIsInstance(thai_family_names(), frozenset)
    self.assertIsInstance(list(thai_family_names())[0], str)
    self.assertIsInstance(thai_female_names(), frozenset)
    self.assertIsInstance(thai_male_names(), frozenset)
    self.assertIsInstance(
        get_corpus_db("https://example.com/XXXXXX0lkjasd/SXfmskdjKKXXX"),
        Response,
    )  # URL does not exist, should get 404 response
    self.assertIsNone(get_corpus_db("XXXlkja3sfdXX"))  # invalid URL
    self.assertEqual(get_corpus_db_detail("XXXmx3KSXX"), {})  # corpus does not exist
    self.assertEqual(
        get_corpus_db_detail("XXXmx3KSXX", version="0.2"), {}
    )  # corpus does not exist
    self.assertTrue(download("test"))  # download the first time
    self.assertTrue(download(name="test", force=True))  # force download
    self.assertTrue(download(name="test"))  # try download existing
    self.assertFalse(download(name="test", url="wrongurl"))  # URL not exist
    self.assertFalse(download(name="XxxXXxxx817d37sf"))  # corpus name not exist
    self.assertIsNotNone(get_corpus_db_detail("test"))  # corpus exists
    self.assertIsNotNone(get_corpus_path("test"))  # corpus exists
    self.assertTrue(remove("test"))  # remove existing
    self.assertFalse(remove("test"))  # remove non-existing
    self.assertIsNone(get_corpus_path("XXXkdjfBzc"))  # query non-existing
    self.assertFalse(download(name="test", version="0.0"))
    self.assertFalse(download(name="test", version="0.0.0"))
    self.assertFalse(download(name="test", version="0.0.1"))
    self.assertFalse(download(name="test", version="0.0.2"))
    self.assertFalse(download(name="test", version="0.0.3"))
    self.assertFalse(download(name="test", version="0.0.4"))
    self.assertIsNotNone(download(name="test", version="0.0.5"))
    self.assertTrue(download("test"))
    self.assertIsNotNone(remove("test"))  # remove existing
    self.assertIsNotNone(download(name="test", version="0.0.6"))
    self.assertIsNotNone(download(name="test", version="0.0.7"))
    self.assertIsNotNone(download(name="test", version="0.0.8"))
    self.assertIsNotNone(download(name="test", version="0.0.9"))
    self.assertIsNotNone(download(name="test", version="0.0.10"))
    with self.assertRaises(Exception) as context:
        self.assertIsNotNone(download(name="test", version="0.0.11"))
    self.assertTrue(
        "Hash does not match expected." in str(context.exception))
    self.assertIsNotNone(download(name="test", version="0.1"))
    self.assertIsNotNone(remove("test"))
def is_stopword(word: str) -> bool:  # check whether the word is a stopword
    """
    Check whether a word is a Thai stopword, using PyThaiNLP.

    Reference
    ---------
    PyThaiNLP, https://github.com/PyThaiNLP/pythainlp
    """
    return word in thai_stopwords()
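# --- Illustrative usage (not part of the original file) ---
# A small sketch of how is_stopword() might be combined with PyThaiNLP's
# word_tokenize to keep only content words; the sample sentence is made up.
from pythainlp.tokenize import word_tokenize

tokens = word_tokenize("ฉันไปกินข้าวที่ร้านอาหาร")
content_words = [w for w in tokens if not is_stopword(w)]
print(content_words)  # pronouns, particles and other stopwords are dropped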
def test_corpus(self):
    self.assertIsNotNone(countries())
    self.assertIsNotNone(provinces())
    self.assertIsNotNone(thai_negations())
    self.assertIsNotNone(thai_stopwords())
    self.assertIsNotNone(thai_syllables())
    self.assertIsNotNone(thai_words())
    download("test")
    self.assertIsNotNone(remove("test"))
    self.assertIsNotNone(remove("tnc_freq"))
def filter_words(text):
    text = text.replace('\n', ' ')
    text = text.replace(',', ' ')
    stop_words = set(thai_stopwords())
    tokens = word_tokenize(text, engine="newmm", keep_whitespace=False)
    filtered_text = []
    for w in tokens:
        if w not in stop_words:
            filtered_text.append(w)
    return filtered_text
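# --- Illustrative usage (not part of the original file) ---
# filter_words() above assumes word_tokenize and thai_stopwords are already
# imported at module level; a quick check with a made-up sentence:
print(filter_words("วันนี้อากาศดีมาก, ไปเที่ยวทะเลกันไหม"))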
def test():
    body = json.loads(request.get_data())
    text = body['text']
    try:
        custom_stopwords = body['custom_stopwords']
    except KeyError:
        custom_stopwords = [""]
    try:
        custom_dict = body['custom_dict']
    except KeyError:
        custom_dict = [""]
    # take the user's input

    # words that must not be shown in the word cloud
    stop_words = list(thai_stopwords()) + list(STOPWORDS) + custom_stopwords
    stop_words = [word.lower() for word in stop_words]  # bug fix: the original map() result was discarded

    # add words that are missing from the Thai/English dictionary so they stay
    # single tokens, e.g. the input "ลุงตู่" would otherwise be split into
    # "ลุง" and "ตู่"; with the custom dictionary the output stays "ลุงตู่"
    pythainlp_words = thai_words()
    dictionary = list(pythainlp_words) + custom_dict

    # set up the tokenizer and split the text
    tok = Tokenizer(dictionary)
    text = tok.word_tokenize(text)
    text = ' '.join(text)
    text = text.lower()

    # generate the word cloud
    wordcloud = WordCloud(stopwords=stop_words,
                          font_path='THSarabunNew.ttf',
                          min_word_length=2,
                          relative_scaling=1.0,
                          min_font_size=1,
                          background_color="black",
                          width=800,
                          height=600,
                          scale=10,
                          font_step=1,
                          collocations=False,
                          colormap="gist_ncar",
                          regexp=r"[\u0E00-\u0E7Fa-zA-Z']+",
                          margin=2).generate(text)

    # lay out the word cloud image
    plt.figure(figsize=(16, 9))
    plt.imshow(wordcloud, cmap=plt.cm.gray, interpolation='bilinear')
    plt.axis("off")

    # save the image on the server and free memory
    wordcloud.to_file('wordcloud.png')
    gc.collect()
    return send_file('wordcloud.png')
def test_corpus(self):
    self.assertIsNotNone(countries())
    self.assertIsNotNone(provinces())
    self.assertIsNotNone(thai_negations())
    self.assertIsNotNone(thai_stopwords())
    self.assertIsNotNone(thai_syllables())
    self.assertIsNotNone(thai_words())
    self.assertIsNotNone(thai_female_names())
    self.assertIsNotNone(thai_male_names())
    self.assertEqual(get_corpus_db_detail("XXX"), {})
    self.assertIsNone(download("test"))
    self.assertIsNone(download("test", force=True))
    self.assertIsNotNone(get_corpus_db_detail("test"))
    self.assertIsNotNone(remove("test"))
    self.assertFalse(remove("test"))
def word_freq_nostop(self, topn=30):
    count = collections.Counter()
    for line in self.lines:
        tokens = line.split(' ')[1:]  # exclude the document id
        doc_id = line.split(' ')[0]
        for token in tokens:
            if token not in (' ', '') and token not in corpus.thai_stopwords() \
                    and token not in ['(', ')']:
                count[token] += 1
    # print topn: rank, word, tokens, tokens/10K
    print(sum(count.values()))
    most = count.most_common(topn)
    for i in range(topn):
        print('| {} | {} | {} | {:.3f} |'.format(
            i + 1, most[i][0], most[i][1], most[i][1] / 1734185 * 10000))
    # return counter
    return count
def trainModel(training_data):
    import nltk.classify
    from sklearn.svm import LinearSVC
    from pythainlp.tokenize import word_tokenize
    from pythainlp.corpus import thai_stopwords
    from itertools import chain

    print('split words ...')
    vocabulary = set(
        chain(*[(set(word_tokenize(i[0])) - set(thai_stopwords()))
                for i in training_data]))
    # vocabulary = set(chain(*[x for x in a if x not in [list(set(word_tokenize(i[0]))) for i in training_data]]))
    print('extract features ...')
    feature_set = [({i: (i in word_tokenize(sentence)) for i in vocabulary}, tag)
                   for sentence, tag in training_data]
    print('train model ...')
    classifier = nltk.classify.SklearnClassifier(LinearSVC())
    classifier.train(feature_set)
    saveModel(vocabulary, classifier)
def make_tfidf(self, stop=False):
    self.tf_dic = {}
    self.idf_dic = {}
    for line in self.lines:
        if len(line.split()) > 1:
            document = line.split()[0]
            tokens = line.split()[1:]
            if document not in self.tf_dic:
                self.tf_dic[document] = {}
            for token in tokens:
                # term frequency (optionally skipping stopwords)
                if not (stop and token in corpus.thai_stopwords()):
                    self.tf_dic[document][token] = \
                        self.tf_dic[document].get(token, 0) + 1
                # document frequency: record which documents contain the token
                if token not in self.idf_dic:
                    self.idf_dic[token] = {document}  # bug fix: set(document) made a set of characters
                else:
                    self.idf_dic[token].add(document)
    self.N = len(self.tf_dic.keys())
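# --- Hypothetical helper (not part of the original class) ---
# One way the tf_dic / idf_dic structures built by make_tfidf() could be turned
# into an actual tf-idf weight, using the common tf * log(N / df) formulation.
# The method name `tfidf` and its placement are assumptions for illustration.
import math

def tfidf(self, document: str, token: str) -> float:
    tf = self.tf_dic.get(document, {}).get(token, 0)
    df = len(self.idf_dic.get(token, set()))
    if tf == 0 or df == 0:
        return 0.0
    return tf * math.log(self.N / df)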
def trainModel(training_data):
    import logging
    logging.getLogger().setLevel(logging.INFO)
    import nltk.classify
    from sklearn.svm import LinearSVC
    from pythainlp.tokenize import word_tokenize
    from pythainlp.corpus import thai_stopwords
    from itertools import chain

    logging.warning(' ============ TRAINING MODEL ============')
    logging.info('split words ...')
    vocabulary = set(
        chain(*[(set(word_tokenize(i[0])) - set(thai_stopwords()))
                for i in training_data]))
    # vocabulary = set(chain(*[x for x in a if x not in [list(set(word_tokenize(i[0]))) for i in training_data]]))
    logging.info('extract features ...')
    feature_set = [({i: (i in word_tokenize(sentence)) for i in vocabulary}, tag)
                   for sentence, tag in training_data]
    classifier = nltk.classify.SklearnClassifier(LinearSVC())
    classifier.train(feature_set)
    saveModel(vocabulary, classifier)
def split_word(text):
    """Split text into tokens and remove stopwords."""
    train_text = ""

    # remove special characters
    pattern = re.compile(r"[^\u0E00-\u0E7Fa-zA-Z' ]|^'|'$|''")
    remove_char = re.findall(pattern, text)
    list_with_removed_char = [char for char in text if char not in remove_char]
    train_text = ''.join(list_with_removed_char)

    # split words using the newmm (maximum matching) algorithm
    tokens = word_tokenize(train_text, engine='newmm')

    # remove Thai and English stopwords
    stopped_tokens = [i for i in tokens
                      if i not in thai_stopwords() and i not in get_stop_words('en')]

    # word stemming (disabled)
    # stemmed_tokens = [PorterStemmer().stem(i) for i in stopped_tokens]

    # remove leftover whitespace and extraction artifacts
    deletelist = [' ', ' ', ' ', ' ', '\n', '\xa0', '\x0c', "'", 'cid']
    tokens = [i for i in stopped_tokens if i not in deletelist]
    return tokens
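# --- Illustrative usage (not part of the original file) ---
# split_word() above relies on `re`, `word_tokenize`, `thai_stopwords` and
# `get_stop_words` being imported elsewhere in the module; sample call:
print(split_word("Research & Development ของบริษัทนี้ เน้นเรื่อง AI เป็นหลัก!"))
# -> tokens with Thai/English stopwords, punctuation and stray characters removed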
def __init__(self, df, n_docs=1000, stoplist=thai_stopwords(),
             smooth_idf=True,
             sentence_tokenize=pythai.tokenize.sent_tokenize,
             word_tokenize=pythai.tokenize.word_tokenize,
             preprocessor=custom_preprocess):
    texts = df['body_text']
    self.sentence_tokenize = sentence_tokenize
    self.word_tokenize = word_tokenize
    self.n_docs = n_docs
    self.stoplist = stoplist
    self.smooth_idf = smooth_idf
    self.preprocessor = preprocessor

    count_vect = self.get_new_countvect()
    self.docs_word_freq = count_vect.fit_transform(texts).toarray()  # (docs, words)
    self.docs_word_freq = np.where(self.docs_word_freq > 0, 1, 0)
    self.docs_word_freq = np.sum(self.docs_word_freq, axis=0)
    # self.docs_word_freq /= np.sum(self.docs_word_freq)
    self.docs_vocab = count_vect.vocabulary_
from flask import Flask, request, json
import pandas as pd
from gensim.models import Word2Vec
from pythainlp.tokenize import word_tokenize
import pythainlp.corpus as st
import numpy as np
import pymongo
from bson.objectid import ObjectId

client = pymongo.MongoClient(
    "mongodb+srv://tum123456:[email protected]/<dbname>?retryWrites=true&w=majority"
)
db = client.test
mydb = db["MyProject"]

words = st.thai_stopwords()
data = pd.read_excel("Book1.xlsx", index_col=0)
word_not_important = [
    'หา', 'รับ', 'งาน', 'ทำหน้าที่', 'หน้าที่', 'ซ่อม', '(', ')', '/'
]

app = Flask(__name__)


def lower(x):
    return x.lower()


def concerned_sentence(search, allsentence):
    arr = []
    for i in search:
        for j in allsentence:
    # remove punctuation marks
    for c in string.punctuation:
        msg = re.sub(r'\{}'.format(c), '', msg)

    # remove separators such as \n and \t
    msg = ' '.join(msg.split())
    return msg


clean_text = [clean_msg(txt) for txt in text_tweets]

# load Thai and English stopwords
nltk.download('words')
th_stop = tuple(thai_stopwords())
en_stop = tuple(get_stop_words('en'))
p_stemmer = PorterStemmer()


# tokenize
def split_word(text):
    # tokenize with the edited dict in the corpus, so only the food menu items
    # added to words.th.txt are split out
    tokens = word_tokenize(text, engine='dict')

    # remove Thai and English stopwords
    tokens = [i for i in tokens if i not in th_stop and i not in en_stop]

    # stem Thai and English words
    # English
    tokens = [p_stemmer.stem(i) for i in tokens]
# -*- coding: utf-8 -*-
from pythainlp.tokenize import word_tokenize, dict_trie
from pythainlp.corpus import thai_stopwords, thai_words, tnc
from pythainlp.util import normalize
import data

stopwords = list(thai_stopwords())
thaiword = list(thai_words())
# tnc1 = [word for word, i in tnc.word_freqs()]
thaiword.remove("กินข้าว")
datadict = dict_trie(
    list(set(data.ccc + thaiword + stopwords + data.conjunctions)))  # +tnc1)))


def wordcut(word):
    global datadict
    return word_tokenize(word, custom_dict=datadict)
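# --- Illustrative usage (not part of the original file) ---
# Because "กินข้าว" was removed from the word list above, the custom trie should
# split it into separate tokens; the exact output depends on the contents of `data`.
print(wordcut("ฉันกินข้าวเช้า"))  # e.g. ['ฉัน', 'กิน', 'ข้าว', 'เช้า']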
from preprocess import normalize_thai_number, normalize_number, remove_markup_tag, normalize_link, normalize_mention
from preprocess import normalize_email, normalize_laugh, unescape_html, normalize_emoji, extract_hashtag
from preprocess import normalize_hashtag, replace_with_actual_hashtag, tokenize, _return_token
import pickle
from pythainlp.corpus import thai_stopwords
from string import punctuation

stopwords = thai_stopwords()
punctuation += '“” ️'
filename = 'models/tfidf_lr.pkl'
pipeline = pickle.load(open(filename, 'rb'))


def transform(text):
    text = text.lower()
    text = normalize_thai_number(text)
    text = unescape_html(text)
    text = remove_markup_tag(text)
    text = normalize_link(text)
    text = normalize_mention(text)
    text = normalize_email(text)
    text = normalize_laugh(text)
    text = normalize_number(text, place_holder='')
    text = normalize_emoji(text)
    hashtags = extract_hashtag(text)
    text = normalize_hashtag(text, place_holder='')
    tokens = tokenize(text, stopwords=None, punctuation=punctuation)
    tokens = replace_with_actual_hashtag(tokens, hashtags)
    return tokens
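# --- Illustrative usage (not part of the original file) ---
# A made-up tweet-like string run through the normalization chain above; the
# returned tokens could then be joined and fed to the pickled pipeline.
sample = "ร้านนี้อร่อยมากกก 5555 #รีวิวร้านอาหาร https://example.com"
print(transform(sample))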
# Disable warnings
warnings.filterwarnings('ignore')

# List of test data
Model_Test = []
# Dictionary of class properties
Class_Properties = {}
# Chrome setting (switch between headless and visible mode)
Chrome_options = True
# Export JSON when the program finishes
Export_JSON = True
# Scan for qualified classes
Qualify = True
# Seconds to wait after the target website has loaded
Delay = 2
# Stopword data (English / Thai)
data_stopwords = list(stopwords.words('english')) + list(thai_stopwords())
# Keywords and HTML tags used when locating address-like content
address = [
    'average', 'dept', 'size', 'weight', 'height', 'word', 'a', 'div', 'em',
    'h1', 'h2', 'h3', 'h4', 'h5', 'label', 'li', 'p', 'section', 'span',
    'strong', 'td', 'tr', 'ul'
]
stringContent = ""

# Create the API with FastAPI
app = FastAPI()


# Navigation
@app.get("/")
async def main():
def _is_stopword(word: str) -> bool:  # check Thai stopword
    return word in thai_stopwords()
from wordcloud import WordCloud, STOPWORDS
from flask import Flask, request, send_file
import json
import gc
import os


def word_preparing(words):
    """Deduplicate the word list and convert every word to lower case."""
    words = set(words)  # drop duplicate words, keep unique words
    return [word.lower() for word in words]  # to lower case


# get the dictionary and stopword corpora
DEFAULT_DICT = list(thai_words())
DEFAULT_STOPWORLS = list(thai_stopwords()) + list(STOPWORDS)
# word preparation
DEFAULT_DICT = word_preparing(DEFAULT_DICT)
DEFAULT_STOPWORLS = word_preparing(DEFAULT_STOPWORLS)

IMAGE_FILE = "wordcloud.png"  # file name used to save the word cloud image

app = Flask(__name__)


@app.route("/wordcloud", methods=["POST"])
def gen_worldcloud():
    # get text, custom_stopwords and custom_dict
    body = json.loads(request.get_data())
    text = body['text']
def remove_stopwords(sentence):
    words = list(
        filter(lambda word: word not in thai_stopwords(),
               word_tokenize(sentence)))
    return ''.join(words)
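# --- Illustrative usage (not part of the original file) ---
# Note that remove_stopwords() joins the surviving tokens with an empty string,
# so the result is one continuous Thai string; the sample sentence is made up.
print(remove_stopwords("ฉันชอบกินข้าวผัดมาก"))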
    trim text & tokenize
    """
    text = html.unescape(text)
    text = re.sub(r'(\n|\t|\xa0)', ' ', text)
    text = re.sub(r'(\r|\u200b)', '', text)
    text = re.sub(r'\bhttps?://\S*\b', '', text)
    text = re.sub(r' +', ' ', text)
    text = re.sub(r'[\'\"‘’“”`\)\(]', '', text)
    return wt(text.strip(' '), keep_whitespace=False)


def cossim(v1, v2) -> float:
    return np.dot(v1, v2) / np.linalg.norm(v1) / np.linalg.norm(v2)


stopwords = corpus.thai_stopwords()


class NewsAnalyze:
    def __init__(self, path: str, publisher: str):
        # publisher: thairath, matichon, dailynews, sanook, nhk
        self.publisher = publisher
        self.path = f'{path}/{publisher}/'
        self.tokenized = sorted(
            glob.glob(self.path + 'tokenized/*.tsv'))  # list of tokenized files

    def tokenize(self, n=5):
        jsons = set(glob.glob(self.path + '*.json'))  # all json files
        tokenized_txt = {
            f.replace('/tokenized/', '/').split('tokenized')[0] + '.json'
            for f in self.tokenized
# -*- coding: utf-8 -*-
from collections import Counter
from typing import Dict, List

from pythainlp.corpus import thai_stopwords

_STOPWORDS = thai_stopwords()


def rank(words: List[str], exclude_stopwords: bool = False) -> Counter:
    """
    Sort words by frequency

    :param list words: a list of words
    :param bool exclude_stopwords: exclude stopwords
    :return: Counter
    """
    if not words:
        return None

    if exclude_stopwords:
        words = [word for word in words if word not in _STOPWORDS]

    return Counter(words)


def find_keyword(word_list: List[str], min_len: int = 3) -> Dict[str, int]:
    """
    :param list word_list: a list of words
    :param int min_len: a minimum length of keywords to look for
    :return: dict
def _is_stopword(word: str) -> bool:  # check whether the word is a stopword
    return word in thai_stopwords()
        vecFromFirst = allCoord - firstPoint
        scalarProduct = np.sum(
            vecFromFirst * np.matlib.repmat(lineVecNorm, nPoints, 1), axis=1)
        vecFromFirstParallel = np.outer(scalarProduct, lineVecNorm)
        vecToLine = vecFromFirst - vecFromFirstParallel
        distToLine = np.sqrt(np.sum(vecToLine**2, axis=1))
        idxOfBestPoint = np.argmax(distToLine)
        print(f'Optimal number of clusters by the elbow method: {idxOfBestPoint}')
        return idxOfBestPoint
    except Exception:
        print("can't find the best k")


# functions for preprocessing
word = thai_stopwords()


def clean_tag(text):
    text = re.sub('<.*?>', '', text)
    return text


def complete_clean(text: str):
    # table for emoticons
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
# -*- coding: utf-8 -*-
from collections import Counter
from typing import Dict, List

from pythainlp.corpus import thai_stopwords

_STOPWORDS = thai_stopwords()


def rank(words: List[str], exclude_stopwords: bool = False) -> Counter:
    """
    Count word frequency given a list of Thai words, with an option to
    exclude stopwords.

    :param list words: a list of words
    :param bool exclude_stopwords: If set to **True**, stopwords are excluded
                                   from counting; otherwise they are counted.
                                   By default, `exclude_stopwords` is **False**.
    :return: a Counter object representing word frequency from the text
    :rtype: :class:`collections.Counter`

    :Example:

    Include stopwords in counting word frequency::

        from pythainlp.util import rank

        words = ["บันทึก", "เหตุการณ์", " ", "มี", "การ", "บันทึก", \\
            "เป็น", " ", "ลายลักษณ์อักษร"]
import pandas
import time
from progressbar import progressbar
import gc
import codecs
from pythainlp import word_tokenize, Tokenizer
from pythainlp.corpus import thai_stopwords, thai_words
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from flask import Flask, request, send_file, after_this_request, render_template, redirect
import numpy as np
import random
import matplotlib

stop_words = list(thai_stopwords()) + list(STOPWORDS) + \
    ["฿", "ly", "pic", "co", "th", "https", "com", "youtu", "http", "www",
     "twitter", "html", "bit"]
stop_words = [word.lower() for word in stop_words]  # bug fix: the original map() result was discarded

pythainlp_words = thai_words()
custom_dict = [
    'โคโรนา', 'ลุงตู่', 'โควิด', 'โคโรน่า', 'เจลล้างมือ', 'ขบวนเสด็จ'
]
dictionary = list(pythainlp_words) + list(custom_dict)
tok = Tokenizer(dictionary)


class main_flask():
    app = Flask(__name__)
from nltk import NaiveBayesClassifier as nbc
import pickle
from pythainlp.tokenize import word_tokenize
from pythainlp.corpus import thai_stopwords
import codecs
from itertools import chain

a = thai_stopwords()

# pos.txt
with codecs.open('pos.txt', 'r', "utf-8") as f:
    lines = f.readlines()
listpos = [e.strip() for e in lines]
del lines
f.close()  # close the file

# neg.txt
with codecs.open('neg.txt', 'r', "utf-8") as f:
    lines = f.readlines()
listneg = [e.strip() for e in lines]
f.close()  # close the file

pos1 = ['pos'] * len(listpos)
neg1 = ['neg'] * len(listneg)

training_data = list(zip(listpos, pos1)) + list(zip(listneg, neg1))

vocabulary = set(chain(*[(set(word_tokenize(i[0])) - set(thai_stopwords()))
                         for i in training_data]))
# vocabulary = set(chain(*[x for x in a if x not in [list(set(word_tokenize(i[0]))) for i in training_data]]))

feature_set = [({i: (i in word_tokenize(sentence)) for i in vocabulary}, tag)
               for sentence, tag in training_data]

classifier = nbc.train(feature_set)
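# --- Illustrative usage (not part of the original script) ---
# Classify a new sentence by building the same word-presence feature dict that
# was used for training; the test sentence is made up for demonstration.
test_sentence = "บริการแย่มาก ไม่ประทับใจเลย"
featurized = {word: (word in word_tokenize(test_sentence)) for word in vocabulary}
print(classifier.classify(featurized))  # expected output: 'pos' or 'neg'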
from pythainlu.intent_classification import naive_bayes, MultinomialNB
from pythainlp.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

file = "../dataset/data-nottag.set"

import pandas as pd

colnames = ['text', 'tag']

from pythainlp.tokenize import word_tokenize, Trie
from pythainlp.corpus import thai_stopwords

# ...
# filtered_words = [word for word in word_list if word not in list(thai_stopwords())]
o = Trie(list(thai_stopwords()))


def filtered_words(x: tuple):
    # w = .lower()  # word_tokenize(x[0], custom_dict=o)
    # ww = [word for word in w if word not in list(thai_stopwords())]
    return (x[0].lower().strip(), x[1])  # (''.join(ww), x[1])


user1 = pd.read_csv(file, names=colnames, header=None, sep="|")
data = [filtered_words(tuple(x)) for x in user1.to_records(index=False)]


def features(text):
    wordlist = word_tokenize(text)
    f = {}
    if "แจ้งเตือน" in text or 'เตือน' in text:
        f['a'] = True