def _doc2features(doc, i) -> dict:
    word = doc[i][0]
    postag = doc[i][1]

    # Features from current word
    features = {
        "word.word": word,
        "word.stopword": _is_stopword(word),
        "word.isthai": isthai(word),
        "word.isspace": word.isspace(),
        "postag": postag,
        "word.isdigit": word.isdigit(),
    }
    if word.isdigit() and len(word) == 5:
        features["word.islen5"] = True

    # Features from previous word
    if i > 0:
        prevword = doc[i - 1][0]
        prevpostag = doc[i - 1][1]
        prev_features = {
            "word.prevword": prevword,
            "word.previsspace": prevword.isspace(),
            "word.previsthai": isthai(prevword),
            "word.prevstopword": _is_stopword(prevword),
            "word.prevpostag": prevpostag,
            "word.prevwordisdigit": prevword.isdigit(),
        }
        features.update(prev_features)
    else:
        features["BOS"] = True  # Special "Beginning of Sequence" tag

    # Features from next word
    if i < len(doc) - 1:
        nextword = doc[i + 1][0]
        nextpostag = doc[i + 1][1]
        next_features = {
            "word.nextword": nextword,
            "word.nextisspace": nextword.isspace(),
            "word.nextpostag": nextpostag,
            "word.nextisthai": isthai(nextword),
            "word.nextstopword": _is_stopword(nextword),
            "word.nextwordisdigit": nextword.isdigit(),
        }
        features.update(next_features)
    else:
        features["EOS"] = True  # Special "End of Sequence" tag

    return features
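# Usage sketch (assumed, not from the original source): _doc2features expects
# `doc` to be a list of (word, postag) pairs and relies on an `_is_stopword`
# helper plus pythainlp's isthai. The `_is_stopword` below is a hypothetical
# stand-in built on pythainlp's stopword list.
from pythainlp.corpus.common import thai_stopwords
from pythainlp.tag import pos_tag
from pythainlp.tokenize import word_tokenize
from pythainlp.util import isthai

_STOPWORDS = thai_stopwords()

def _is_stopword(word: str) -> bool:  # hypothetical helper assumed by _doc2features
    return word in _STOPWORDS

words = word_tokenize("ผมไปโรงเรียน", keep_whitespace=False)
doc = pos_tag(words)  # [(word, postag), ...]
X = [_doc2features(doc, i) for i in range(len(doc))]  # one feature dict per token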
def test_isthai(self):
    self.assertEqual(isthai("ไทย"), True)
    self.assertEqual(isthai("ไทย0"), False)
    self.assertEqual(isthai("ต.ค."), True)
    self.assertEqual(isthai("(ต.ค.)"), False)
    self.assertEqual(isthai("ต.ค.", ignore_chars=None), False)
    self.assertEqual(isthai("(ต.ค.)", ignore_chars=".()"), True)
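# The assertions above hinge on isthai()'s ignore_chars parameter: it defaults
# to ".", so periods are skipped when checking that every character is Thai;
# passing None (or "") makes every character count. A quick demonstration:
from pythainlp.util import isthai

print(isthai("ต.ค."))                        # True  - "." ignored by default
print(isthai("ต.ค.", ignore_chars=None))     # False - the periods now count
print(isthai("(ต.ค.)", ignore_chars=".()"))  # True  - parentheses ignored too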
def extract_vocabs(file):
    words = dict()

    with open(file, "r") as fh:
        for l in fh:
            l = re.sub(benchmark.TAG_RX, "", l.strip())
            for w in l.split("|"):
                if not util.isthai(w):
                    continue
                if w in words:
                    words[w] += 1
                else:
                    words[w] = 1

    print(f"File: {file}")
    print(f"  no. vocabs: {len(words)}")

    return words
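# Usage sketch (assumptions: `re`, `benchmark`, and `util` are imports of the
# original module, with benchmark.TAG_RX a regex for in-line tags, and the
# input file is pipe-delimited, one tokenized sentence per line; the path is
# hypothetical).
vocabs = extract_vocabs("data/train.txt")
top10 = sorted(vocabs.items(), key=lambda kv: kv[1], reverse=True)[:10]
print(top10)  # the ten most frequent Thai tokens in the file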
def process_corpus(corpus, number_token="<NUM>", oov_token="<OOV>"):
    # Create an empty dictionary and token list
    dictionary = {oov_token: 1}
    tokenized_corpus = []
    corpus = corpus[1:1000]

    for entry in corpus:
        # Normalize the entry
        entry = normalize(entry)

        # Tokenize each entry
        tokens = np.array(
            tokenize.word_tokenize(entry, engine='newmm', keep_whitespace=False))

        # Remove non-Thai words
        tokens = tokens[[
            isthai(t, ignore_chars="0123456789") and t != "" for t in tokens
        ]]

        # Replace numbers with text
        #tokens = [re.sub("^\d*$", num2words(t, lang='th'), t) for t in tokens]

        # Add the tokens to the tokenized corpus
        tokenized_corpus.append(tokens)

        # Add the tokens to the dictionary and increment counts
        for t in tokens:
            if t in dictionary:
                dictionary[t] = dictionary[t] + 1
            else:
                dictionary[t] = 1

    return tokenized_corpus, dictionary
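# Usage sketch (assumed): a tiny in-memory corpus stands in for the real one.
# Note that process_corpus() slices corpus[1:1000], so the first entry is
# skipped, and number_token is unused while the num2words line stays commented out.
import numpy as np
from pythainlp import tokenize
from pythainlp.util import isthai, normalize

corpus = ["header row", "ราคา 100 บาท", "สวัสดีครับ"]
tokenized, vocab = process_corpus(corpus)
print(vocab)  # token -> count, seeded with {"<OOV>": 1}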
def Tokenize_word(self, text):
    ######## Thai word segment ######## ver1
    '''sent = text[0].replace("'","")
    word = word_tokenize(sent, engine='deepcut') # use this method
    wword = [x.replace('.',' ').replace('=',' ').replace('-',' ').replace("("," ").replace(")"," ").replace("/"," ").replace('สำหรับ',' ').replace('%',' ').strip(' ') for x in word]
    words = []
    for w in wword:
        if w not in common.thai_stopwords():
            words = [str for str in words if str]
            words.append(w)
    return words'''

    ######## Thai word segment ######## ver2 -> stopwords, type of words, check spell (Eng & Thai)
    sent = text[0].replace("'", "")
    word = word_tokenize(sent, engine='deepcut')  # use this method
    #wword = [x.replace('=',' ').replace('-',' ').replace("("," ").replace(")"," ").replace("/"," ").strip(' ') for x in word]

    th_no_stopwords = []
    all_no_stopwords = []
    th_correct_words = []
    eng_correct_words = []
    mix_correct_words = []
    mix1_correct_words = []
    all_correct_words = []
    all_correct_words_final = []
    check_thai_list = []

    # Drop Thai stopwords
    #for tw in wword:
    for tw in word:
        if tw not in common.thai_stopwords():
            th_no_stopwords = [str for str in th_no_stopwords if str]
            th_no_stopwords.append(tw)
    #print("th_no_stopwords = ", th_no_stopwords)

    # Drop English stopwords
    for ew in th_no_stopwords:
        if ew not in stopwords.words('english'):
            all_no_stopwords = [str for str in all_no_stopwords if str]
            all_no_stopwords.append(ew)
    #print("all_no_stopwords = ", all_no_stopwords)

    # Classify each token and spell-check Thai/English words
    for c in all_no_stopwords:
        thai = isthai(c)
        number = c.isnumeric()
        if not thai:
            no_num = c.isalpha()
            match1 = re.findall(r'\D', c)  # truthy if c contains any non-digit character
            if no_num:
                spell = SpellChecker()
                eng_correct = spell.correction(c)  #pn
                eng_correct_words.append(eng_correct)
                #print("eng = ", eng_correct)
            elif match1:
                mix = c
                mix_correct_words.append(mix)
                #print("mix = ", mix)
            else:
                num = c  # digits only - not kept
                #print("num = ", num)
        elif thai:
            checker = NorvigSpellChecker(custom_dict=tnc.word_freqs())  #pn
            th_correct = checker.correct(c)
            th_correct_words.append(th_correct)
            #print("thai = ", th_correct)

    all_correct_words = th_correct_words + eng_correct_words + mix_correct_words
    all_correct_words = [x.replace('น.','').replace(':',' ').replace('=',' ').replace('–',' ').replace("("," ").replace(")"," ").replace("/"," ").strip(" ") for x in all_correct_words]
    all_correct_words_final = list(filter(None, all_correct_words))
    #print("words = ", all_correct_words_final)
    return all_correct_words_final

    ######## Eng word segment ########
    #word = text[0]
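# A minimal sketch of the two spell checkers used above: pythainlp's
# NorvigSpellChecker backed by TNC word frequencies for Thai, and
# pyspellchecker's SpellChecker for English. The example words are illustrative.
from pythainlp.corpus import tnc
from pythainlp.spell import NorvigSpellChecker
from spellchecker import SpellChecker

th_checker = NorvigSpellChecker(custom_dict=tnc.word_freqs())
print(th_checker.correct("สัสดี"))       # most likely Thai correction
en_checker = SpellChecker()
print(en_checker.correction("speling"))  # -> "spelling"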
def Tokenize_word(self, text):
    ######## Thai word segment ######## ver1
    '''sent = text[0].replace("'","")
    word = word_tokenize(sent, engine='deepcut') # use this method
    wword = [x.replace('.',' ').replace('=',' ').replace('-',' ').replace("("," ").replace(")"," ").replace("/"," ").replace('สำหรับ',' ').replace('%',' ').strip(' ') for x in word]
    words = []
    for w in wword:
        if w not in common.thai_stopwords():
            words = [str for str in words if str]
            words.append(w)
    return words'''

    ######## Thai word segment ######## ver.2 -> stopwords, type of words
    sent = text[0].replace("'", "")
    word = word_tokenize(sent, engine='deepcut')  # use this method
    wword = [x.replace('.',' ').replace('%',' ').replace('=',' ').replace('-',' ').replace("("," ").replace(")"," ").replace("/"," ").strip(' ') for x in word]

    th_no_stopwords = []
    eng_no_stopwords = []
    th_correct_words = []
    eng_correct_words = []
    mix_correct_words = []
    mix1_correct_words = []
    all_correct_words = []
    all_correct_words_final = []
    check_thai_list = []

    # Route each token: Thai words keep non-stopwords; non-Thai tokens are
    # split into English, mixed, and numeric (numeric tokens are dropped)
    for w in wword:
        thai = isthai(w)
        #number = c.isnumeric()
        if thai:
            if w not in common.thai_stopwords():
                #th_no_stopwords = [str for str in th_no_stopwords if str]
                th_no_stopwords.append(w)
                #print("thai = ", th_correct)
        elif not thai:
            if w not in stopwords.words('english'):
                #eng_no_stopwords = [str for str in eng_no_stopwords if str]
                #eng_no_stopwords.append(w)
                no_num = w.isalpha()
                match1 = re.findall(r'\D', w)  # truthy if w contains any non-digit character
                if no_num:
                    eng = w
                    eng_no_stopwords.append(eng)
                    #print("eng = ", eng_correct)
                elif match1:
                    mix = w
                    mix_correct_words.append(mix)
                    #print("mix = ", mix)
                else:
                    num = w  # digits only - not kept
                    #print("num = ", num)

    #print("th_correct_words = ", th_correct_words)
    #print("eng_correct_stopwords = ", eng_correct_words)
    all_correct_words = th_no_stopwords + eng_no_stopwords + mix_correct_words
    all_correct_words = [x.replace('น.','').replace(':',' ').replace('=',' ').replace('–',' ').replace("("," ").replace(")"," ").replace("/"," ").strip(" ") for x in all_correct_words]
    all_correct_words_final = list(filter(None, all_correct_words))
    #print("words = ", all_correct_words)
    return all_correct_words_final

    ######## Eng word segment ########
    #word = text[0]
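# A minimal sketch of the token routing in ver.2: isthai() separates Thai from
# non-Thai tokens, then isalpha() and a non-digit regex split the rest into
# English, mixed, and purely numeric tokens (numeric ones are dropped).
import re
from pythainlp.util import isthai

for tok in ["บ้าน", "cat", "A1", "42"]:
    if isthai(tok):
        kind = "thai"
    elif tok.isalpha():
        kind = "eng"
    elif re.findall(r"\D", tok):
        kind = "mix"
    else:
        kind = "num"  # dropped by Tokenize_word
    print(tok, "->", kind)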
def main_nlp(self, datas):
    """datas is the raw content string to be cleaned and tokenized;
    returns the list of useful tokens."""
    output_list = []
    out_STR = ""

    # -------------------- Filter everything and keep only "useful" words --------------------
    try:
        # Strip hashtags
        pattern = re.compile(r"(#+[a-zA-Z0-9(_)|ก-๙(_)0-9]{1,})")
        out_str_hashtags = pattern.sub("", datas)
        # Strip @mentions
        pattern = re.compile(r"(@+[a-zA-Z0-9(_)|ก-๙(_)0-9]{1,})")
        out_str_add = pattern.sub("", out_str_hashtags)
        # Strip emoji
        str_output = emoji.get_emoji_regexp().sub(u'', out_str_add)
        # Strip links
        pattern = re.compile(r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))")
        out_str_link = pattern.sub("", str_output)
        # Strip mixed letter+digit tokens
        pattern = re.compile(r"([A-Za-z-_]+[\d]+[\w]*|[\d]+[A-Za-z-_]+[\w]*)")
        out_str_number = pattern.sub("", out_str_link)
        # Strip numbers (Thai and Arabic digits)
        pattern = re.compile(r"([๑-๙(_)0-9]{1,})")
        out_str_ = pattern.sub("", out_str_number)
        out_STR += out_str_

        # Collect hashtags
        pattern = re.compile(r"(?P<out_list>#+[a-zA-Z0-9(_)|ก-๙(_)0-9]{1,})")
        output_list += re.findall(pattern, datas)
        # Collect @mentions
        pattern = re.compile(r"(?P<out_list>@+[a-zA-Z0-9(_)|ก-๙(_)0-9]{1,})")
        output_list += re.findall(pattern, datas)
        # Collect mixed letter+digit tokens
        pattern = re.compile(r"(?P<out_list>[A-Za-z-_]+[\d]+[\w]*|[\d]+[A-Za-z-_]+[\w]*)")
        output_list += re.findall(pattern, datas)
    except AttributeError:
        pass
    # -------------------- Filter everything and keep only "useful" words --------------------

    #proc = deepcut.tokenize(out_STR)
    proc = word_tokenize(out_STR, engine="newmm", keep_whitespace=False,
                         custom_dict=self.custom_tokenizer)

    for i in proc:
        # ---------------- special symbols ----------------
        special = re.compile(r"\W+").sub("", i)  # drop non-word characters
        if special == "" or i.lower() == "https" or i.lower() == "http":
            continue
        # --------------------------------------------------
        # ------------ stop words, Thai and English ------------
        if isthai(i):
            if i not in self.stopwords_thai:
                output_list.append(i)
        elif i.isascii():
            if (i.lower() not in self.STOP_WORD_1
                    and i.lower() not in self.STOP_WORD_2
                    and i.lower() not in self.STOP_WORD_3):
                output_list.append(i)
        # -------------------------------------------------------

    return output_list
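# A minimal sketch (illustrative input) of the hashtag/mention patterns used in
# main_nlp: "#" or "@" followed by ASCII letters, digits, underscores, or Thai
# characters (the ก-๙ range), captured before being stripped from the text.
import re

text = "ไปเที่ยว #เชียงใหม่ กับ @friend_01"
hashtags = re.findall(r"(#+[a-zA-Z0-9(_)|ก-๙(_)0-9]{1,})", text)
mentions = re.findall(r"(@+[a-zA-Z0-9(_)|ก-๙(_)0-9]{1,})", text)
print(hashtags, mentions)  # ['#เชียงใหม่'] ['@friend_01']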
def isThaiWord(word):
    return isthai(word)
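# The wrapper simply forwards to pythainlp.util.isthai:
from pythainlp.util import isthai

print(isThaiWord("ไทย"))   # True
print(isThaiWord("Thai"))  # False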