def test_word_tokenize(self): self.assertEqual(word_tokenize(""), []) self.assertEqual( word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"), ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"], ) self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="newmm")) self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="mm")) self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="longest")) self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="ulmfit")) self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="icu")) self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="deepcut")) self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="XX")) self.assertIsNotNone(dict_trie(())) self.assertIsNotNone(dict_trie(("ทดสอบ", "สร้าง", "Trie"))) self.assertIsNotNone(dict_trie(["ทดสอบ", "สร้าง", "Trie"])) self.assertIsNotNone(dict_trie(thai_words())) self.assertIsNotNone(dict_trie(FROZEN_DICT_TRIE)) self.assertIsNotNone(word_tokenize("รถไฟฟ้าBTS", custom_dict=DEFAULT_DICT_TRIE)) self.assertIsNotNone( word_tokenize("ทดสอบ", engine="deepcut", custom_dict=FROZEN_DICT_TRIE) ) self.assertIsNotNone( word_tokenize("ทดสอบ", engine="XX", custom_dict=FROZEN_DICT_TRIE) )
def test_word_tokenize(self): self.assertEqual(word_tokenize(""), []) self.assertEqual( word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"), ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"], ) self.assertIsNotNone( word_tokenize("หมอนทองตากลมหูว์MBK39", engine="newmm") ) self.assertIsNotNone( word_tokenize("หมอนทองตากลมหูว์MBK39", engine="mm") ) self.assertIsNotNone( word_tokenize("หมอนทองตากลมหูว์MBK39", engine="longest") ) self.assertIsNotNone( word_tokenize("หมอนทองตากลมหูว์MBK39", engine="icu") ) self.assertIsNotNone( word_tokenize("หมอนทองตากลมหูว์MBK39", engine="deepcut") ) self.assertIsNotNone( word_tokenize("หมอนทองตากลมหูว์MBK39", engine="attacut") ) self.assertIsNotNone( word_tokenize("หมอนทองตากลมหูว์MBK39", engine="XX") ) # XX engine is not existed self.assertIsNotNone(dict_trie(())) self.assertIsNotNone(dict_trie(("ทดสอบ", "สร้าง", "Trie"))) self.assertIsNotNone(dict_trie(["ทดสอบ", "สร้าง", "Trie"])) self.assertIsNotNone(dict_trie({"ทดสอบ", "สร้าง", "Trie"})) self.assertIsNotNone(dict_trie(thai_words())) self.assertIsNotNone(dict_trie(DEFAULT_DICT_TRIE)) self.assertIsNotNone( dict_trie(os.path.join(_CORPUS_PATH, _THAI_WORDS_FILENAME)) ) self.assertTrue( "ไฟ" in word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"])) ) # Commented out until this unittest bug get fixed: # https://bugs.python.org/issue29620 # with self.assertWarns(DeprecationWarning): # dict_word_tokenize("เลิกใช้แล้ว", custom_dict=DEFAULT_DICT_TRIE) self.assertEqual( word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"])), dict_word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"])), )
def createBOW(ls_txt, corpus):
    # Extend the default Thai dictionary with domain-specific words
    custom_dict = set(thai_words())
    word = ['ราเมง', 'อิเกีย', 'คาปูชิโน่', 'น้ำมัน', 'หอยลาย', 'ปุ้มปุ้ย']
    for i in word:
        custom_dict.add(i)
    trie = dict_trie(dict_source=custom_dict)

    # Build a bag-of-words vector per text; the last element counts leftover tokens
    BOW_t = [list() for i in range(len(ls_txt))]
    l = 0
    for i in ls_txt:
        tmp = word_tokenize(i, engine='dict', custom_dict=trie)
        for j in corpus:
            if j in tmp:
                BOW_t[l].append(tmp.count(j))
                tmp.remove(j)
            else:
                BOW_t[l].append(0)
        if len(tmp) != 0:
            BOW_t[l].append(len(tmp))
        else:
            BOW_t[l].append(0)
        l += 1

    # corpus_t = corpus.append('Other')
    # ch = pd.DataFrame({
    #     'train': corpus,
    #     'target': BOW_t[0]
    # })
    # ch
    # predictiontree = dtree.predict(BOW_t)
    return list(BOW_t)
def tokenize_text_list_test(ls):
    print("working on")
    li = [
        'cfcut', 'deepcut', 'etcc', 'longest', 'multi_cut', 'newmm', 'ssg',
        'tcc', 'trie'
    ]
    # li = ['cfcut', 'newmm']
    custom_dict = set(thai_words())
    trie = dict_trie(dict_source=custom_dict)

    p, q = [], []  # p: token lists per engine, q: elapsed time per engine
    for x in li:
        start = time.process_time()
        if x == 'deepcut':
            # deepcut is benchmarked without the custom dictionary
            g = list(
                chain.from_iterable(
                    [pythainlp.tokenize.word_tokenize(l, engine=x) for l in ls]
                )
            )
        else:
            g = list(
                chain.from_iterable(
                    [
                        pythainlp.tokenize.word_tokenize(
                            l, engine=x, custom_dict=trie
                        )
                        for l in ls
                    ]
                )
            )
        p.append(g)
        # print(g)
        tim = time.process_time() - start
        q.append(tim)
    return p, q
def test_dict_word_tokenize(self):
    self.assertEqual(dict_word_tokenize("", custom_dict=FROZEN_DICT_TRIE), [])
    self.assertIsNotNone(
        dict_word_tokenize("รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE)
    )
    self.assertIsNotNone(dict_trie(()))
    self.assertIsNotNone(
        dict_word_tokenize(
            "รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE, engine="newmm"
        )
    )
    self.assertIsNotNone(
        dict_word_tokenize(
            "รถไฟฟ้ากรุงเทพBTSหูว์ค์",
            custom_dict=FROZEN_DICT_TRIE,
            engine="longest",
        )
    )
    self.assertIsNotNone(
        dict_word_tokenize(
            "รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE, engine="mm"
        )
    )
    self.assertIsNotNone(
        dict_word_tokenize(
            "รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE, engine="XX"
        )
    )
def __init__(self):
    self.menus = []
    # self.menu = {}
    self.normwords = []
    self.lang_convert_th_eng = {
        "ร้อน": "Hot",
        "เย็น": "Ice",
        "ปั่น": "Frappe",
    }
    self.word_filter = [
        "ร้อน", "เย็น", "ปั่น", "แก้ว", "ใหญ่", "หวาน", "น้ำตาล", "นม",
        "นมข้น", "วิปครีม", "วิป", "ไม่", "ใส่", "น้อย", "ปกติ", "กลาง",
        "มาก", "เยอะ", "เพิ่ม"
    ]

    # Load menu names and their English translations
    with open("other_module/menu.csv", encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:  # each row is a list
            self.lang_convert_th_eng[row[0]] = row[1]
            # self.menu.append([row[0], row[1]])
            self.menus.append(row[0])

    # Build the keyword trie used for tokenization
    self.keyword = [item for item in self.menus if item != "ชา"]
    self.keyword.append("วิป")
    self.keyword.append("วิปครีม")
    self.keyword.append("ร้อน")
    self.keyword_dict = dict_trie(self.keyword)

    with open("other_module/normwords.csv", encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:  # each row is a list
            self.normwords.append(row)
def main():
    engineOption = ["newmm", "longest-matching", "dict", "ulmfit"]
    f = codecs.open('input.txt', encoding='utf-8')
    fsort = open("output-sort.csv", "w", encoding="utf-8")

    text = ""
    for line in f:
        # print(line)
        text = text + line

    # Add domain-specific words to the default Thai dictionary
    custom_words_list = set(thai_words())
    custom_words_list.add('รีเทนเนอร์')
    custom_words_list.add('จัดฟัน')
    custom_words_list.add('ฟันชิด')
    trie = dict_trie(dict_source=custom_words_list)
    _tokenizer = Tokenizer(custom_dict=trie, engine='newmm')

    print('------ Starting to tokenize words ------')
    # words = word_tokenize(text, engine=engineOption[0])
    words = _tokenizer.word_tokenize(text)

    i = 0
    wordsNew = ""
    for word in words:
        if word and (not word.isspace()) and word != '-' and word != '/' and not word.isnumeric():
            i = i + 1
            # print(i, ': ', word.strip())
            wordsNew = wordsNew + word.strip() + " "
    f.close()

    print('------ Starting to count words: ------')
    wordlist = wordsNew.split()
    wordfreq = []
    for w in wordlist:
        wordfreq.append(wordlist.count(w.strip()))
        # Progress indicator
        i = i + 1
        if i % 150 == 0:
            print(".")
        else:
            print(".", end='')
    # Build the frequency dictionary once, after counting
    dictionary = wordListToFreqDict(wordlist)
    sorteddict = sortFreqDict(dictionary)

    print('------ Starting to sort words and write to file ------')
    for s in sorteddict:
        print(s[1], "|", s[0])
        fsort.write(s[1] + "|" + str(s[0]))
        fsort.write('\n')
    fsort.close()
def train():
    df = pd.read_csv("Data/Expenses.csv")

    # Extend the default Thai dictionary with domain-specific words
    custom_dict = set(thai_words())
    word = ['ราเมง', 'อิเกีย', 'คาปูชิโน่', 'น้ำมัน', 'หอยลาย', 'ปุ้มปุ้ย']
    for i in word:
        custom_dict.add(i)
    trie = dict_trie(dict_source=custom_dict)

    # Collect the unique tokens seen in the training texts
    corpus = []
    for i in df.text:
        for j in word_tokenize(i, engine='dict', custom_dict=trie):
            if j not in corpus:
                corpus.append(j)

    # Bag-of-words vectors; the last element counts leftover tokens
    BOW = [list() for i in range(len(df.text))]
    l = 0
    for i in df.text:
        tmp = word_tokenize(i, engine='dict', custom_dict=trie)
        for j in corpus:
            if j in tmp:
                BOW[l].append(tmp.count(j))
                tmp.remove(j)
            else:
                BOW[l].append(0)
        if len(tmp) != 0:
            BOW[l].append(len(tmp))
        else:
            BOW[l].append(0)
        l += 1

    ytarget = df.cate
    xtrain = BOW
    dtree = DecisionTreeClassifier()
    dtree.fit(X=xtrain, y=ytarget)
    return corpus, dtree
def ProcessText(self, text):
    dataBase = database()
    streetDf, addressDf = dataBase.ReadStreetName()
    streetList = dataBase.DataframeToList(streetDf)
    addressList = dataBase.DataframeToList(addressDf)
    districtList = dataBase.districtName_
    wordList = districtList + streetList + addressList

    # Build a custom tokenizer that knows district, street and address names
    custom_words_list = set(thai_words())
    custom_words_list.update(wordList)
    custom_words_list.update(self.specWord)
    trie = dict_trie(dict_source=custom_words_list)
    custom_tokenizer = Tokenizer(custom_dict=trie, engine=self.engineSel)

    proc = custom_tokenizer.word_tokenize(text)

    # Strip punctuation and digits from the tokens
    cleanList_1 = [
        i.translate(str.maketrans('', '', string.punctuation)) for i in proc
    ]
    cleanList = [
        i.translate(str.maketrans('', '', '1234567890')) for i in cleanList_1
    ]

    # NOTE: cleanList is computed above, but the filters below operate on the raw tokens (proc)
    procText = list(filter(lambda x: x != " ", proc))
    procText = list(filter(lambda x: x != " ", procText))
    procText = list(filter(lambda x: x != "", procText))
    # procText = list(filter(lambda x: len(x) > 2, procText))
    joinText = ' '.join(procText)
    # print(joinText)
    return joinText
def syllable_tokenize_lu(text: str) -> List[str]:
    """Reference:
    https://thainlp.org/pythainlp/docs/2.0/_modules/pythainlp/tokenize.html#syllable_tokenize
    """
    if not text or not isinstance(text, str):
        return []

    tokens = []

    # Read the Lu syllable list
    with open(LU_SYLLABLE_FILENAME, 'r') as f:
        syllable_lu_dict = json.load(f)

    # Create a custom dict trie for Lu syllables
    lu_syllable = syllable_lu_dict['data']
    dict_source = frozenset(lu_syllable)
    trie = dict_trie(dict_source)

    # Tokenize into words first, then split each word into syllables
    words = word_tokenize(text, custom_dict=trie)
    # print("lu", words)
    # dict_source = frozenset(set(lu_syllable).union(set(thai_syllables())))
    for word in words:
        tokens.extend(word_tokenize(text=word, custom_dict=trie))

    return tokens
def word_sylleble(text):
    tokens = []
    if text:
        trie = dict_trie(dict_source=listtext)
        tokens.extend(onecut(trie, text=text))
    return [tokens, text]
# -*- coding: utf-8 -*-
from pythainlp.tokenize import word_tokenize, dict_trie
from pythainlp.corpus import thai_stopwords, thai_words, tnc
from pythainlp.util import normalize

import data

stopwords = list(thai_stopwords())
thaiword = list(thai_words())
# tnc1 = [word for word, i in tnc.word_freqs()]
thaiword.remove("กินข้าว")

# Combined dictionary: word lists from the local data module + default Thai words + stopwords
datadict = dict_trie(
    list(set(data.ccc + thaiword + stopwords + data.conjunctions))
)  # +tnc1


def wordcut(word):
    global datadict
    return word_tokenize(word, custom_dict=datadict)