def text_to_item(self, raw_text):
    res = {}
    text = self.replace_word(raw_text)
    words = dict_word_tokenize(text, self.keyword_dict)
    # Keep known keywords whole; re-tokenize everything else with the
    # default dictionary.
    new_w = []
    for w in words:
        if w in self.keyword:
            new_w.append(w)
        else:
            new_w.extend(word_tokenize(w))
    words = new_w
    print(words)
    items_translated_2 = self.translate2(words)
    # Debug comparison against the old translate1 path:
    # items = self.split_item(words)
    # items_translated_1 = [self.translate1(item) for item in items]
    # if items_translated_2 != items_translated_1:
    #     print(raw_text)
    #     print(items_translated_1)
    #     print(items_translated_2)
    if not items_translated_2:
        res["status"] = 'failed to process "{}".'.format(raw_text)
    else:
        res["status"] = 'processed "{}" successfully.'.format(raw_text)
        res["item"] = items_translated_2
    return res
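# A minimal standalone sketch of the keyword-preserving pass above, assuming
# PyThaiNLP 2.x; KEYWORDS here is a hypothetical example, not the
# application's real keyword_dict.
from pythainlp.tokenize import dict_trie, word_tokenize

KEYWORDS = {"รถไฟฟ้า"}  # hypothetical keyword set
KEYWORD_TRIE = dict_trie(KEYWORDS)

def keyword_tokenize(text):
    # First pass keeps keywords intact; the second pass re-splits the
    # remaining chunks with the default dictionary, as text_to_item does.
    out = []
    for chunk in word_tokenize(text, custom_dict=KEYWORD_TRIE):
        if chunk in KEYWORDS:
            out.append(chunk)
        else:
            out.extend(word_tokenize(chunk))
    return out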
def word_tokenize_to_g2p(text):
    wordall = dict_word_tokenize(text, custom_dict_trie=DEFAULT_DICT_TRIE)
    results = []  # avoid shadowing the built-in `list`
    for a in wordall:
        try:
            # Prefer the curated pronunciation dictionary.
            results.append(data[a])
        except KeyError:
            # Unknown word: split with ICU and romanize each piece.
            for b in word_tokenize(a, engine="icu"):
                results.append(romanization(b, engine="pyicu"))
    return "|".join(results)
def word_tokenize_to_g2p(text):
    wordall = dict_word_tokenize(text, custom_dict=DEFAULT_DICT_TRIE)
    results = []  # avoid shadowing the built-in `list`
    for a in wordall:
        try:
            results.append(data[a])  # curated pronunciation first
        except KeyError:
            for b in word_tokenize(a):
                results.append(romanization(b))
    # Split on both "|" and "-" so the result is a flat phoneme list.
    return "|".join(results).replace("-", "|").split("|")
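# Both variants above implement the same lookup-with-fallback idea. A compact
# sketch using the PyThaiNLP 2.x API (pythainlp.transliterate.romanize in
# place of the older romanization); `pronunciations` is a hypothetical
# word-to-phoneme dict standing in for `data`.
from pythainlp.tokenize import word_tokenize
from pythainlp.transliterate import romanize

pronunciations = {"ภาษาไทย": "phaa-saa-thai"}  # hypothetical entry

def g2p(text):
    out = []
    for w in word_tokenize(text):
        # Prefer the curated pronunciation; fall back to romanization.
        out.append(pronunciations.get(w, romanize(w)))
    return "|".join(out)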
def test_dict_word_tokenize(self):
    self.assertEqual(dict_word_tokenize("", custom_dict=FROZEN_DICT_TRIE), [])
    self.assertIsNotNone(
        dict_word_tokenize("รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE)
    )
    self.assertIsNotNone(dict_trie(()))
    self.assertIsNotNone(
        dict_word_tokenize(
            "รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE, engine="newmm"
        )
    )
    self.assertIsNotNone(
        dict_word_tokenize(
            "รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE, engine="longest"
        )
    )
    self.assertIsNotNone(
        dict_word_tokenize(
            "รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE, engine="mm"
        )
    )
    self.assertIsNotNone(
        dict_word_tokenize(
            "รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE, engine="XX"
        )
    )
def test_word_tokenize(self):
    self.assertEqual(word_tokenize(""), [])
    self.assertEqual(
        word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"),
        ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
    )
    self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="newmm"))
    self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="mm"))
    self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="longest"))
    self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="icu"))
    self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="deepcut"))
    self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="attacut"))
    self.assertIsNotNone(
        word_tokenize("หมอนทองตากลมหูว์MBK39", engine="XX")
    )  # the "XX" engine does not exist; should fall back to the default

    self.assertIsNotNone(dict_trie(()))
    self.assertIsNotNone(dict_trie(("ทดสอบ", "สร้าง", "Trie")))
    self.assertIsNotNone(dict_trie(["ทดสอบ", "สร้าง", "Trie"]))
    self.assertIsNotNone(dict_trie({"ทดสอบ", "สร้าง", "Trie"}))
    self.assertIsNotNone(dict_trie(thai_words()))
    self.assertIsNotNone(dict_trie(DEFAULT_DICT_TRIE))
    self.assertIsNotNone(
        dict_trie(os.path.join(_CORPUS_PATH, _THAI_WORDS_FILENAME))
    )

    self.assertTrue(
        "ไฟ" in word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"]))
    )

    # Commented out until this unittest bug gets fixed:
    # https://bugs.python.org/issue29620
    # with self.assertWarns(DeprecationWarning):
    #     dict_word_tokenize("เลิกใช้แล้ว", custom_dict=DEFAULT_DICT_TRIE)
    self.assertEqual(
        word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"])),
        dict_word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"])),
    )
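# The final assertion above is the migration path in miniature:
# dict_word_tokenize is deprecated, and word_tokenize with custom_dict
# returns the same tokens. A sketch of the preferred call, assuming
# PyThaiNLP 2.x:
from pythainlp.tokenize import dict_trie, word_tokenize

tokens = word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"]), engine="newmm")
print(tokens)  # contains "ไฟ", per the assertion above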
def run(self, text):
    self.sound = None
    self.text_cut = dict_word_tokenize(text, Trie(self.word_list))
    self.i = 0  # loop index
    self.num_sound = 0  # number of clips merged so far
    while self.i < len(self.text_cut):
        word = self.text_cut[self.i]
        if word in self.word_list and self.num_sound == 0:
            # Known word and first match: use its clip as the base sound.
            self.sound = AudioSegment.from_wav(
                "data/" + self.data_file[word] + ".wav")
            self.num_sound += 1
        elif word in self.word_list:
            # Append subsequent clips to the combined sound.
            self.sound += AudioSegment.from_wav(
                "data/" + self.data_file[word] + ".wav")
            self.num_sound += 1
        self.i += 1
    if self.sound is not None:  # nothing to export if no word matched
        self.sound.export(self.file, format="wav")
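# A self-contained sketch of the clip-joining loop above, assuming pydub is
# installed; `clips` (token -> wav path) is a hypothetical stand-in for
# self.data_file, and the None check guards against exporting when no token
# matched.
from pydub import AudioSegment

def join_clips(tokens, clips, out_path):
    sound = None
    for t in tokens:
        if t not in clips:
            continue  # no recording for this token
        seg = AudioSegment.from_wav(clips[t])
        sound = seg if sound is None else sound + seg
    if sound is not None:
        sound.export(out_path, format="wav")
    return sound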
# -*- coding: utf-8 -*-
from pythainlp.tokenize import dict_word_tokenize, create_custom_dict_trie

data = create_custom_dict_trie("wordlist.txt")

while True:
    text = input("text : ")
    print(dict_word_tokenize(text, custom_dict_trie=data, engine="newmm"))
    print()
อย่างหนึ่ง""".split("\n") # หน้า 64 http://www.arts.chula.ac.th/~ling/thesis/2556MA-LING-Nalinee.pdf with codecs.open("corpus.txt", 'r',encoding='utf8') as f: lines1 = list(set(normalize(f.read()).splitlines())) f.close() test=True#False#True##เปิด/ปิดการ test #''' with codecs.open("thai.txt", "r",encoding="utf8") as f: lines2 = f.read().splitlines()#''' ''' from pythainlp.corpus.thaiword import get_data lines2 =get_data()''' data_all=[] thaiword=create_custom_dict_trie(list(set(ccc+lines2+stopwords+conjunctions))) print("จำนวนประโยค : "+str(len(lines1))) for lines in lines1: text=dict_word_tokenize(lines,thaiword) #text=word_tokenize(lines,thai_tokenize) data_all.append(text) sents=data_all tokens = [] boundaries = set() offset = 0 def check_punctuation(text): for i in text: if i in list(set(punctuation)): return True return False def num_there(s): return any(i.isdigit() for i in s) for sent in sents: tokens.extend(sent)
def segment(txt):
    # Tokenize with a combined custom word list, via the old
    # data=/data_type= keyword interface of dict_word_tokenize.
    return dict_word_tokenize(
        text=txt,
        data=get_data(filename) + get_data(filename2),
        data_type="list",
        engine="newmm",
    )
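# segment() rebuilds its word list on every call. A sketch of the same idea
# with the trie built once up front, assuming PyThaiNLP 2.x (get_data,
# filename, and filename2 are the surrounding script's own names):
from pythainlp.tokenize import dict_trie, word_tokenize

CUSTOM_TRIE = dict_trie(get_data(filename) + get_data(filename2))

def segment(txt):
    return word_tokenize(txt, custom_dict=CUSTOM_TRIE, engine="newmm")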
# -*- coding: utf-8 -*-
import sqlite3

from pythainlp.tokenize import dict_word_tokenize, create_custom_dict_trie, word_tokenize

# Load the word list from the SQLite database.
connection = sqlite3.connect("db.sqlite3")
cursor = connection.execute("select word from word")
wordlist = [i[0] for i in cursor.fetchall()]
# print("\n".join(wordlist))
print("จำนวนคำ : " + str(len(wordlist)))  # word count
connection.close()

dictthai = create_custom_dict_trie(wordlist)

while True:
    text = input("text : ")
    if text == "exit":
        break
    # Compare the custom dictionary against the default PyThaiNLP one.
    print("ผลจาก dict : \t" + "|".join(dict_word_tokenize(text, dictthai)))
    print("ผลจาก PyThaiNLP : \t" + "|".join(word_tokenize(text)))
def test_dict_word_tokenize(self):
    self.assertEqual(dict_word_tokenize(""), [])
อย่างหนึ่ง""".split("\n") # หน้า 64 http://www.arts.chula.ac.th/~ling/thesis/2556MA-LING-Nalinee.pdf with codecs.open("corpus.txt", 'r',encoding='utf8') as f: lines1 = list(set(normalize(f.read()).splitlines())) f.close() test=False#True##เปิด/ปิดการ test #''' with codecs.open("thai.txt", "r",encoding="utf8") as f: lines2 = f.read().splitlines()#''' ''' from pythainlp.corpus.thaiword import get_data lines2 =get_data()''' data_all=[] thaiword=create_custom_dict_trie(list(set(ccc+lines2+stopwords+conjunctions))) print("จำนวนประโยค : "+str(len(lines1))) for lines in lines1: text=dict_word_tokenize(lines,thaiword) #text=word_tokenize(lines,thai_tokenize) data_all.append(text) sents=data_all tokens = [] boundaries = set() offset = 0 def check_punctuation(text): for i in text: if i in list(set(punctuation)): return True return False def num_there(s): return any(i.isdigit() for i in s) for sent in sents: tokens.extend(sent)