from pythainlp.util import normalize
from transformers import CamembertTokenizerFast


def proc_ques(ques, pretrain_name='airesearch/wangchanberta-base-att-spm-uncased', maxlen=416):
    tokenizer = CamembertTokenizerFast.from_pretrained(pretrain_name, model_max_length=maxlen)
    q = ques['question']
    q = q.lower()
    q = normalize(q)
    return tokenizer(q, return_tensors="pt", padding='max_length')
    # Legacy dictionary-based implementation, kept for reference (its trailing
    # return was left uncommented and unreachable in the original):
    # ques_ix = np.zeros(max_token, np.int64)
    # words = re.sub(
    #     r"([.,'!?\"()*#:;])",
    #     '',
    #     ques['question'].lower()
    # ).replace('-', ' ').replace('/', ' ').split()
    # for ix, word in enumerate(words):
    #     if word in token_to_ix:
    #         ques_ix[ix] = token_to_ix[word]
    #     else:
    #         ques_ix[ix] = token_to_ix['UNK']
    #     if ix + 1 == max_token:
    #         break
    # return ques_ix
import string

from pythainlp.util import normalize


def normalize_answer(s):
    """Lowercase text, strip punctuation, and collapse extra whitespace."""

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return normalize(remove_punc(lower(s.strip()))).replace('\xa0', ' ')
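# Usage sketch (not from the original source). The final whitespace collapsing
# comes from pythainlp's normalize, which removes duplicate spaces:
print(normalize_answer("  Bangkok,  Thailand!  "))  # -> "bangkok thailand"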
def tokenize(stat_ques_list, pretrain_name='airesearch/wangchanberta-base-att-spm-uncased', maxlen=416):
    # pass maxlen to the tokenizer so padding='max_length' has a bound
    # (the original left the parameter unused)
    tokenizer = CamembertTokenizerFast.from_pretrained(pretrain_name, model_max_length=maxlen)
    tokenized_dataset = []
    for q in stat_ques_list:
        q = q['question']
        q = q.lower()
        q = normalize(q)
        tokenized_dataset.append(tokenizer(q, padding='max_length'))
    return tokenized_dataset
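# Usage sketch (not from the original source): stat_ques_list is assumed to be
# a list of dicts with a "question" key; the call downloads the WangchanBERTa
# tokenizer on first use.
questions = [{"question": "เเปลกไหม"}]
encoded = tokenize(questions)
print(encoded[0]["input_ids"][:8])  # first few token ids, padded to maxlen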
def listen(self, text, play=True):
    text = normalize(text)
    self.list = word_tokenize(text)
    if play and self.engine == "thaitts":
        try:
            thaitts(" ".join(self.list), self.thaitts, "./t.wav")
            playsound('./t.wav')
        except Exception:
            print("ไม่สามารถพูดได้ : " + str(text))  # "unable to speak: <text>"
    elif play and self.engine == "g":
        gTTS1(text, "t.mp3")
        playsound('t.mp3')
import numpy as np
from pythainlp import tokenize
from pythainlp.util import isthai, normalize


def process_corpus(corpus, number_token="<NUM>", oov_token="<OOV>"):
    # Create an empty dictionary and token list
    dictionary = {oov_token: 1}
    tokenized_corpus = []
    corpus = corpus[1:1000]
    for entry in corpus:
        # Normalize the entry
        entry = normalize(entry)
        # Tokenize each entry
        tokens = np.array(
            tokenize.word_tokenize(entry, engine='newmm', keep_whitespace=False))
        # Remove non-Thai words
        tokens = tokens[[
            isthai(t, ignore_chars="0123456789") and t != "" for t in tokens
        ]]
        # Replace numbers with text
        #tokens = [re.sub("^\d*$", num2words(t, lang='th'), t) for t in tokens]
        # Add the tokens to the tokenized corpus
        tokenized_corpus.append(tokens)
        # Add the tokens to the dictionary and increment counts
        for t in tokens:
            if t in dictionary:
                dictionary[t] = dictionary[t] + 1
            else:
                dictionary[t] = 1
    return tokenized_corpus, dictionary
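# Usage sketch with made-up data (not from the original source). Note that the
# corpus[1:1000] slice skips the first entry:
sample = ["header row", "ฉันรักภาษาไทย", "แมวสองตัว"]
tokenized, counts = process_corpus(sample)
print(tokenized)  # e.g. [array(['ฉัน', 'รัก', 'ภาษาไทย'], ...), ...]
print(counts)     # token -> frequency, seeded with {"<OOV>": 1}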
# -*- coding: utf-8 -*-
from pythainlp.util import normalize

print(normalize("เเปลก") == "แปลก")  # "เ เ ป ล ก" vs "แปลก"
def test_normalize(self):
    self.assertEqual(normalize("เเปลก"), "แปลก")
หรืออีก อย่างหนึ่ง หากแต่ว่า เหตุดังนั้น เหตุ นี้ เหมือนดังว่า อย่างไรก็ดี อย่างไรก็ ตาม อนึ่งคือว่า อีกประการหนึ่ง อีก อย่างหนึ่ง""".split("\n")  # page 64, http://www.arts.chula.ac.th/~ling/thesis/2556MA-LING-Nalinee.pdf

with codecs.open("corpus.txt", 'r', encoding='utf8') as f:
    lines1 = list(set(normalize(f.read()).splitlines()))

test = True  # False  # toggle the test on/off

with codecs.open("thai.txt", "r", encoding="utf8") as f:
    lines2 = f.read().splitlines()
# Alternative word-list source:
# from pythainlp.corpus.thaiword import get_data
# lines2 = get_data()

data_all = []
thaiword = create_custom_dict_trie(list(set(ccc + lines2 + stopwords + conjunctions)))
print("จำนวนประโยค : " + str(len(lines1)))  # "number of sentences"
for lines in lines1:
    text = dict_word_tokenize(lines, thaiword)
    #text = word_tokenize(lines, thai_tokenize)
    data_all.append(text)
def segment_sentences(words):
    start = 0
    sents = []
    num_true = 0.0
    num_all = 0
    for i, word in enumerate(words):
        dist = classifier.prob_classify(punct_features(words, i))
        for label in dist.samples():
            if label == True:
                num_true += dist.prob(label)
        # note: num_true accumulates over the whole sequence, so the 0.60
        # threshold acts as a warm-up before the first split is allowed
        if classifier.classify(punct_features(words, i)) == True and num_true > 0.60:
            sents.append(words[start:i + 1])
            start = i + 1
    if start < len(words):
        sents.append(words[start:])
    #print(num_true / num_all)
    return sents


while True:
    thai_sent = normalize(input("Text : "))
    #thai_word = word_tokenize(thai_sent, thai_tokenize)
    text_all = dict_word_tokenize(thai_sent, thaiword)
    """temp = thai_sent.split(' ')
    for data in temp:
        thai_word = dict_word_tokenize(data, thaiword)
        text_all.extend(thai_word)"""
    #print(text_all)
    thai_sents = segment_sentences(text_all)
    print('sent : ' + '/'.join([''.join(i) for i in thai_sents]))
import glob
import os

from pythainlp.util import normalize

p = os.path.join(".", "text")
listfile = [i for i in list(glob.glob(p + "/*.txt"))]


def readfile(path):
    with open(path, "r", encoding="utf-8-sig") as f:
        return f.read()


def writefile(path, data):
    with open(path, "w", encoding="utf-8") as f:
        f.write(data)


def clean(data):
    # NOTE: the first element of the leading pairs appears to have lost an
    # invisible or mis-encoded character in transit; as written,
    # str.replace("", x) would insert x between every character.
    rule = [("", "์"), ("", "่"), ("", "้"), ("", "ี"), ("", "็"), ("", "้"),
            ("", "่"), ("", "ิ"), ("", "ื"), ("", "ั"), ("", "๊"),
            (" ่", "่"), (" ้", "้"), (" ๋", "๋"), (" ๊", "๊"), (" ็", "็"),
            (" ั", "ั"), (" ู้", " ู้".replace(" ", "")),
            (" ื้", " ื้".replace(" ", "")), (" ์", "์"), (" ิ", "ิ"),
            (" ื", "ื"), (" ี่", " ี่".replace(" ", ""))]
    for i in rule:
        data = data.replace(i[0], i[1])
    return data


listdata = [normalize(clean(normalize(readfile(i)))) for i in listfile]
for i, file in enumerate(listfile):
    writefile(file, listdata[i])
from pythainlp import word_tokenize
from pythainlp.util import normalize


def word_tokenization(text):
    tokenized = " ".join(word_tokenize(normalize(text), keep_whitespace=False))
    return tokenized
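# Usage sketch (not from the original source); exact segmentation depends on
# the default newmm dictionary:
print(word_tokenization("เเปลกมาก"))  # e.g. "แปลก มาก"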
def test_normalize(self):
    self.assertEqual(normalize("เเปลก"), "แปลก")
    self.assertIsNotNone(normalize("พรรค์จันทร์ab์"))
import re

from pythainlp import word_tokenize
from pythainlp.corpus import thai_stopwords
from pythainlp.util import normalize


def text_cleaning(texts):
    clean_word = []
    stop_words = thai_stopwords()
    for text in texts:
        # emoji lists
        pos_emoji = re.compile(
            u'[\U0001F600\U0001F603\U0001F604\U0001F601\U0001F606\U0001F60A\U0000263A\U0000FE0F\U0001F923\U0001F642\U0001F609\U0001F60C\U0001F619\U0001F617\U0001F618\U0001F970\U0001F60D\U0001F61A\U0001F60B\U0001F61B\U0001F61D\U0001F61C\U0001F973\U0001F60F\U0001F633\U0001F638\U0001F63A\U0001F63D\U0001F63B\U0001F63C\U0001F44D\U0001F3FB\U0001F91F\U0001F3FB\U0001F918\U0001F3FB\U0001F48B\U00002764\U0000FE0F\U0001F9E1\U0001F49B\U0001F49A\U0001F499\U0001F49C\U00002763\U0000FE0F\U0001F495\U0001F49E\U0001F493\U0001F497\U0001F496\U0001F498\U0001F49D]',
            flags=re.UNICODE)
        neg_emoji = re.compile(
            u'[\U0001F494\U0001F642\U0001F643\U0001F61E\U0001F612\U0001F60F\U0001F614\U0001F61F\U0001F615\U0001F641\U00002639\U0000FE0F\U0001F623\U0001F616\U0001F62B\U0001F629\U0001F97A\U0001F622\U0001F62D\U0001F60F\U0001F624\U0001F620\U0001F621\U0001F92C\U0001F92F\U0001F975\U0001F628\U0001F630\U0001F625\U0001F613\U0001F925\U0001F636\U0001F610\U0001F611\U0001F644\U0001F626\U0001F640\U0001F63E\U0001F63C\U0001F595\U0001F3FB\U0001F44E\U0001F3FB\U0001F9B6\U0001F3FB\U0001F448\U0001F3FB\U0001F91E\U0001F3FB\U0001F44B\U0001F3FB\U0001F47F\U0001F47A\U0001F921\U0001F92E\U0001F974\U0001F463]',
            flags=re.UNICODE)
        pos_count = len(re.findall(pos_emoji, text))
        neg_count = len(re.findall(neg_emoji, text))
        #text.replace('☺️', 'posemo')
        #for emo in pos_emoji: text = text.replace(emo, 'posemo')
        emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002500-\U00002BEF"  # chinese char
            u"\U00002702-\U000027B0"
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            u"\U0001f926-\U0001f937"
            u"\U00010000-\U0010ffff"
            u"\u2640-\u2642"
            u"\u2600-\u2B55"
            u"\u200d"
            u"\u23cf"
            u"\u23e9"
            u"\u231a"
            u"\ufe0f"  # dingbats
            u"\u3030"
            "]+",
            flags=re.UNICODE)
        text = emoji_pattern.sub(r"", text)
        # delete links, hashtags and mentions
        text = re.sub(r"(?:@\S*|#\S*|http(?=.*://)\S*)", "", text)
        text = re.sub(r"^https://t.co/[A-Za-z0-9]*\s", "", text)
        text = re.sub(r"\s+https://t.co/[a-zA-Z0-9]*\s", "", text)
        text = re.sub(r"\s+https://t.co/[a-zA-Z0-9]*$", "", text)
        # find and delete laughter (runs of "555...")
        laugh_count = len(re.findall(r'(5)\1{2,}(6?){3,}', text))
        text = re.sub(r'(5)\1{2,}(6?){3,}', '', text)
        # delete symbols
        text = re.sub(r'[!-@[-`{-~]', "", text)
        #text = re.sub("\d+", "", text)  # numbers
        text = normalize(text)
        # tokenization
        tokens = word_tokenize(text)
        # delete whitespace, one-letter and single-character-repeat tokens
        i = 0
        for token in list(tokens):
            if (len(token) == 1 or len(token) == token.count(token[0])
                    or token in ['xxrep', 'xxwrep', '', 'ชา', 'นนท์', 'ปอนด์', 'ป้อม']):
                tokens.pop(i)
                i = i - 1
            i = i + 1
        # add thailaugh / posemoji / negemoji tags
        for a in range(laugh_count):
            tokens.append('thailaugh')
        for a in range(pos_count):
            tokens.append('posemoji')
        for a in range(neg_count):
            tokens.append('negemoji')
        # POS tag (disabled)
        # from pythainlp.tag import pos_tag
        # pos = pos_tag(tokens, corpus='orchid_ud')
        # keep_tag = ['VERB', 'ADJ', 'ADV', 'INTJ', 'AUX']
        # keep_tag = ['VACT','VATT','ADVN','ADVI','ADVP','ADVS','FIXV','NEG','ADJ','']
        # pos_tags = [t[0] for t in pos if (t[1] in keep_tag) or (t[0] == "thailaugh")
        #             or (t[0] == "posemoji") or (t[0] == "negemoji")]
        # tokens = pos_tags
        # delete stop words
        filtered_sentence = []
        for t in tokens:
            if t not in stop_words:
                #t = ''.join(c[0] for c in itertools.groupby(t))
                filtered_sentence.append(t)
        clean_word.append(','.join(filtered_sentence))
    return clean_word
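# Usage sketch with a made-up tweet (not from the original source); the exact
# output depends on pythainlp's stopword list:
tweets = ["เเปลก มาก 5555"]
print(text_cleaning(tweets))  # e.g. ['แปลก,thailaugh']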
def test_normalize(self):
    self.assertIsNotNone(normalize("พรรค์จันทร์ab์"))

    # normalize sara e + sara e
    self.assertEqual(normalize("เเปลก"), "แปลก")

    # normalize consonant + nikhahit + sara aa
    self.assertEqual(normalize("นํา"), "นำ")
    self.assertEqual(normalize("\u0e01\u0e4d\u0e32"), "\u0e01\u0e33")

    # normalize consonant + tone mark + nikhahit + sara aa
    self.assertEqual(normalize("\u0e01\u0e48\u0e4d\u0e32"), "\u0e01\u0e48\u0e33")

    # reorder consonant + follow vowel + tone mark
    self.assertEqual(normalize("\u0e01\u0e30\u0e48"), "\u0e01\u0e48\u0e30")

    # reorder consonant + nikhahit + tone mark + sara aa
    self.assertEqual(normalize("\u0e01\u0e4d\u0e48\u0e32"), "\u0e01\u0e48\u0e33")

    # reorder consonant + follow vowel + tone mark
    self.assertEqual(normalize("\u0e01\u0e32\u0e48"), "\u0e01\u0e48\u0e32")

    # remove repeating following vowels
    self.assertEqual(normalize("กาา"), "กา")
    self.assertEqual(normalize("กา า า า"), "กา")
    self.assertEqual(normalize("กา าาะา"), "กาะา")

    # remove repeating tone marks
    self.assertEqual(normalize("\u0e01\u0e48\u0e48"), "\u0e01\u0e48")

    # remove repeating different tone marks
    self.assertEqual(normalize("\u0e01\u0e48\u0e49"), "\u0e01\u0e49")
    self.assertEqual(normalize("\u0e01\u0e48\u0e49\u0e48\u0e49"), "\u0e01\u0e49")

    # remove tone mark at the beginning of text
    self.assertEqual(remove_dangling("\u0e48\u0e01"), "\u0e01")
    self.assertEqual(remove_dangling("\u0e48\u0e48\u0e01"), "\u0e01")
    self.assertEqual(remove_dangling("\u0e48\u0e49\u0e01"), "\u0e01")
    self.assertEqual(remove_dangling("\u0e48\u0e01\u0e48"), "\u0e01\u0e48")

    # remove duplicate spaces
    self.assertEqual(remove_dup_spaces("  ab  c d  "), "ab c d")
    self.assertEqual(remove_dup_spaces("\nab  c \n d \n"), "ab c\nd")

    # remove tone marks
    self.assertEqual(remove_tonemark("จิ้น"), "จิน")
    self.assertEqual(remove_tonemark("เก๋า"), "เกา")
    self.assertEqual(delete_tone("เจ๋งเป้ง"), remove_tonemark("เจ๋งเป้ง"))
    with self.assertWarns(DeprecationWarning):
        delete_tone("ค้าบ")

    # remove zero width chars
    self.assertEqual(remove_zw("กา\u200b"), "กา")
    self.assertEqual(remove_zw("ก\u200cา"), "กา")
    self.assertEqual(remove_zw("\u200bกา"), "กา")
    self.assertEqual(remove_zw("กา\u200b\u200c\u200b"), "กา")
from pythainlp.util import normalize


def normalize_word(text):
    return normalize(text)
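# The wrapper simply exposes pythainlp's normalize; for example, two
# consecutive sara e characters are rewritten as one sara ae (see the tests above):
print(normalize_word("เเปลก"))  # -> "แปลก"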