# Assumed imports (not shown in the original snippet):
from pythainlp.corpus import thai_stopwords
from pythainlp.util import isthai


def _is_stopword(word: str) -> bool:  # helper assumed to check the Thai stop-word list
    return word in thai_stopwords()


def _doc2features(doc, i) -> dict:
    word = doc[i][0]    # token text
    postag = doc[i][1]  # part-of-speech tag

    # Features from current word
    features = {
        "word.word": word,
        "word.stopword": _is_stopword(word),
        "word.isthai": isthai(word),
        "word.isspace": word.isspace(),
        "postag": postag,
        "word.isdigit": word.isdigit(),
    }
    # Flag five-digit numbers (e.g. Thai postal codes)
    if word.isdigit() and len(word) == 5:
        features["word.islen5"] = True

    # Features from previous word
    if i > 0:
        prevword = doc[i - 1][0]
        prevpostag = doc[i - 1][1]
        prev_features = {
            "word.prevword": prevword,
            "word.previsspace": prevword.isspace(),
            "word.previsthai": isthai(prevword),
            "word.prevstopword": _is_stopword(prevword),
            "word.prevpostag": prevpostag,
            "word.prevwordisdigit": prevword.isdigit(),
        }
        features.update(prev_features)
    else:
        features["BOS"] = True  # Special "Beginning of Sequence" tag

    # Features from next word
    if i < len(doc) - 1:
        nextword = doc[i + 1][0]
        nextpostag = doc[i + 1][1]
        next_features = {
            "word.nextword": nextword,
            "word.nextisspace": nextword.isspace(),
            "word.nextpostag": nextpostag,
            "word.nextisthai": isthai(nextword),
            "word.nextstopword": _is_stopword(nextword),
            "word.nextwordisdigit": nextword.isdigit(),
        }
        features.update(next_features)
    else:
        features["EOS"] = True  # Special "End of Sequence" tag

    return features
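A minimal usage sketch, assuming PyThaiNLP's tokenizer and POS tagger (the sample sentence is illustrative): each token becomes one feature dict, the usual input shape for a CRF tagger.

from pythainlp.tag import pos_tag
from pythainlp.tokenize import word_tokenize

doc = pos_tag(word_tokenize("ผมรักประเทศไทย"))        # [(word, postag), ...]
X = [_doc2features(doc, i) for i in range(len(doc))]  # one feature dict per token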
Example #4
 # From a unittest.TestCase; assumes `from pythainlp.util import isthai`.
 def test_isthai(self):
     self.assertEqual(isthai("ไทย"), True)
     self.assertEqual(isthai("ไทย0"), False)
     self.assertEqual(isthai("ต.ค."), True)
     self.assertEqual(isthai("(ต.ค.)"), False)
     self.assertEqual(isthai("ต.ค.", ignore_chars=None), False)
     self.assertEqual(isthai("(ต.ค.)", ignore_chars=".()"), True)
Example #5
import re

# `benchmark` and `util` are assumed to be project-local modules providing
# the TAG_RX pattern and isthai().
def extract_vocabs(file):
    words = dict()
    with open(file, "r", encoding="utf-8") as fh:
        for line in fh:
            # Strip annotation tags, then split on the token delimiter
            line = re.sub(benchmark.TAG_RX, "", line.strip())
            for w in line.split("|"):
                if not util.isthai(w):
                    continue
                words[w] = words.get(w, 0) + 1

    print(f"File: {file}")
    print(f" no. vocabs: {len(words)}")
    return words
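A hedged usage sketch (the corpus path is hypothetical; the file is expected to hold pipe-delimited tokens, one line per sentence, as implied by split("|") above):

vocabs = extract_vocabs("corpus/train.txt")  # hypothetical path
top10 = sorted(vocabs.items(), key=lambda kv: kv[1], reverse=True)[:10]
print(top10)  # ten most frequent Thai tokens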
Example #6
# Assumed imports (not shown in the original snippet):
#   import numpy as np
#   from pythainlp import tokenize
#   from pythainlp.util import isthai, normalize
def process_corpus(corpus, number_token="<NUM>", oov_token="<OOV>"):
    # Create a dictionary seeded with the OOV token, and an empty token list
    dictionary = {oov_token: 1}
    tokenized_corpus = []

    # Keep only entries 1..999 (skips entry 0; presumably a cap for faster runs)
    corpus = corpus[1:1000]

    for entry in corpus:
        # Normalize the entry
        entry = normalize(entry)

        # Tokenize each entry
        tokens = np.array(
            tokenize.word_tokenize(entry,
                                   engine='newmm',
                                   keep_whitespace=False))

        # Remove non-Thai words
        tokens = tokens[[
            isthai(t, ignore_chars="0123456789") and t != "" for t in tokens
        ]]

        # Replace numbers with words (disabled; note that number_token is otherwise unused)
        # tokens = [re.sub(r"^\d*$", num2words(t, lang="th"), t) for t in tokens]

        # Add the tokens to the tokenized corpus
        tokenized_corpus.append(tokens)

        # Add the tokens to the dictionary and increment counts
        for t in tokens:
            dictionary[t] = dictionary.get(t, 0) + 1

    return tokenized_corpus, dictionary
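A minimal usage sketch (the sample corpus is illustrative; note that the function skips the first entry and caps at 999):

corpus = ["header row", "ผมรักประเทศไทย", "กินข้าวสองจาน"]
tokenized, vocab = process_corpus(corpus)
print(tokenized)  # one array of tokens per processed entry
print(vocab)      # token -> count, seeded with {"<OOV>": 1}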
Example #7
 # Assumed module-level imports (not shown in the original snippet):
 #   import re
 #   from nltk.corpus import stopwords
 #   from spellchecker import SpellChecker            # pyspellchecker
 #   from pythainlp.corpus import common, tnc
 #   from pythainlp.spell import NorvigSpellChecker
 #   from pythainlp.tokenize import word_tokenize
 #   from pythainlp.util import isthai
 def Tokenize_word(self, text):

     ######## Thai word segment ######## ver1 (disabled)
     '''sent = text[0].replace("'","")
     word = word_tokenize(sent, engine='deepcut') # use this method
     wword = [x.replace('.',' ').replace('=',' ').replace('-',' ').replace("("," ").replace(")"," ").replace("/"," ").replace('สำหรับ',' ').replace('%',' ').strip(' ') for x in word]
     words =[]
     for w in wword:
         if w not in common.thai_stopwords():
             words = [str for str in words if str]
             words.append(w)
     return words'''
 
     ######## Thai word segment ######## ver2 -> stop words, word types, spell check (Eng & Thai)
     sent = text[0].replace("'", "")
     word = word_tokenize(sent, engine='deepcut')  # deepcut word segmentation via PyThaiNLP
     #wword = [x.replace('=',' ').replace('-',' ').replace("("," ").replace(")"," ").replace("/"," ").strip(' ') for x in word]
     th_no_stopwords =[]
     all_no_stopwords =[]
     th_correct_words =[]
     eng_correct_words =[]
     mix_correct_words =[]
     mix1_correct_words =[]
     all_correct_words =[]
     all_correct_words_final =[]
     check_thai_list = []
     #for tw in wword:
     for tw in word:
         if tw not in common.thai_stopwords():
             th_no_stopwords = [s for s in th_no_stopwords if s]  # drop empty strings
             th_no_stopwords.append(tw)
     #print("th_no_stopwords = ", th_no_stopwords)
     for ew in th_no_stopwords:
         if ew not in stopwords.words('english'):
             all_no_stopwords = [s for s in all_no_stopwords if s]  # drop empty strings
             all_no_stopwords.append(ew)
     #print("all_no_stopwords = ", all_no_stopwords)
     for c in all_no_stopwords:
         thai = isthai(c)
         number = c.isnumeric()  # computed but unused
         if not thai:
             no_num = c.isalpha()
             match1 = re.findall(r'\D', c)  # non-digit characters; non-empty unless the token is digits only
             if no_num:
                 spell = SpellChecker()  # English spell checker (pyspellchecker), re-created per token
                 eng_correct = spell.correction(c)
                 eng_correct_words.append(eng_correct)
                 #print("eng = ", eng_correct)
             elif match1:
                 mix = c
                 mix_correct_words.append(mix)
                 #print("mix = ", mix)
             else:
                 num = c  # purely numeric token: intentionally dropped
                 #print("num = ", num)
         elif thai:
             checker = NorvigSpellChecker(custom_dict=tnc.word_freqs())  # Thai spell checker with TNC word frequencies
             th_correct = checker.correct(c)
             th_correct_words.append(th_correct)
             #print("thai = ", th_correct)
           
     all_correct_words = th_correct_words + eng_correct_words + mix_correct_words
     all_correct_words = [x.replace('น.','').replace(':',' ').replace('=',' ').replace('–',' ').replace("("," ").replace(")"," ").replace("/"," ").strip(" ") for x in all_correct_words]
     all_correct_words_final = list(filter(None, all_correct_words))
     #print("words = ", all_correct_words_final)  
     return all_correct_words_final
 
     
     ######## Eng word segment ########
     '''word = text[0]
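A hedged usage sketch (the class name and sample text are hypothetical; input arrives as a one-element list, per text[0] above):

tk = TokenClean()  # hypothetical class exposing Tokenize_word
words = tk.Tokenize_word(["เปิดตัว iPhone15 ราคา 30000 บาท veryy good"])
print(words)  # spell-corrected Thai/English tokens; purely numeric tokens dropped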
Example #8
    # Assumed imports as in the previous example (re, nltk stopwords,
    # and PyThaiNLP's common, word_tokenize, and isthai).
    def Tokenize_word(self, text):

        ######## Thai word segment ######## ver1 (disabled)
        '''sent = text[0].replace("'","")
        word = word_tokenize(sent, engine='deepcut') # use this method
        wword = [x.replace('.',' ').replace('=',' ').replace('-',' ').replace("("," ").replace(")"," ").replace("/"," ").replace('สำหรับ',' ').replace('%',' ').strip(' ') for x in word]
        words =[]
        for w in wword:
            if w not in common.thai_stopwords():
                words = [str for str in words if str]
                words.append(w)
        return words'''
    
        ######## Thai word segment ######## ver.2 -> stopwords, type of words
        sent = text[0].replace("'","")
        word = word_tokenize(sent, engine='deepcut') # use this method
        wword = [x.replace('.',' ').replace('%',' ').replace('=',' ').replace('-',' ').replace("("," ").replace(")"," ").replace("/"," ").strip(' ') for x in word]
        th_no_stopwords =[]
        eng_no_stopwords =[]
        th_correct_words =[]
        eng_correct_words =[]
        mix_correct_words =[]
        mix1_correct_words =[]
        all_correct_words =[]
        all_correct_words_final =[]
        check_thai_list = []
        for w in wword:
            thai = isthai(w)
            #number = c.isnumeric()
            if thai:
                if w not in common.thai_stopwords():
                    #th_no_stopwords = [str for str in th_no_stopwords if str]        
                    th_no_stopwords.append(w)
                    #print("thai = ", th_correct)
            elif not thai:
                if w not in stopwords.words('english'):
                    #eng_no_stopwords = [str for str in eng_no_stopwords if str]        
                    #eng_no_stopwords.append(w)
                    no_num = w.isalpha()
                    match1 = re.findall(r'\D', w)  # non-digit characters; non-empty unless the token is digits only
                    if no_num:
                        eng = w
                        eng_no_stopwords.append(eng)
                        #print("eng = ", eng_correct)
                    elif match1:
                        mix = w
                        mix_correct_words.append(mix)
                        #print("mix = ", mix)
                    else:
                        num = w  # purely numeric token: intentionally dropped
                        #print("num = ", num)
            

        #print("th_correct_words = ", th_correct_words)
        #print("eng_correct_stopwords = ", eng_correct_words)
        
        all_correct_words = th_no_stopwords + eng_no_stopwords + mix_correct_words
        all_correct_words = [x.replace('น.','').replace(':',' ').replace('=',' ').replace('–',' ').replace("("," ").replace(")"," ").replace("/"," ").strip(" ") for x in all_correct_words]
        all_correct_words_final = list(filter(None, all_correct_words))
        #print("words = ", all_correct_words)
        return all_correct_words_final
    
        
        ######## Eng word segment ########
        '''word = text[0]
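The non-Thai branch above classifies tokens via isalpha() and a \D search; a minimal sketch of that three-way split, with illustrative tokens:

import re

for tok in ["good", "A380", "2563"]:
    if tok.isalpha():             # letters only -> English word
        kind = "eng"
    elif re.findall(r"\D", tok):  # contains any non-digit -> mixed token
        kind = "mix"
    else:                         # digits only -> dropped by the method
        kind = "num"
    print(tok, kind)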
Example #9
    # Assumed imports (not shown in the original snippet):
    #   import re
    #   import emoji                                   # emoji < 2.0, for get_emoji_regexp()
    #   from pythainlp.tokenize import word_tokenize
    #   from pythainlp.util import isthai
    def main_nlp(self, datas):
        """
        datas: the content to process; the regexes below are applied to it
        directly, so it should be a single string.
        """
        output_list = []
        out_STR = ""
        # -------------------------- Filter the text and collect the "useful" words --------------------------------
        try:
            # strip hashtags
            pattern  = re.compile(r"(#+[a-zA-Z0-9(_)|ก-๙(_)0-9]{1,})")
            out_str_hashtags = pattern.sub("", datas)

            # strip @mentions
            pattern  = re.compile(r"(@+[a-zA-Z0-9(_)|ก-๙(_)0-9]{1,})")
            out_str_add = pattern.sub("", out_str_hashtags)

            # strip emoji (get_emoji_regexp() was removed in emoji 2.x)
            str_output = emoji.get_emoji_regexp().sub(u'', out_str_add)

            # strip URLs
            pattern  = re.compile(r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))")
            out_str_link = pattern.sub("", str_output)  

            # strip letter+digit mixtures (e.g. "4G", "A380")
            pattern  = re.compile(r"([A-Za-z-_]+[\d]+[\w]*|[\d]+[A-Za-z-_]+[\w]*)")
            out_str_number = pattern.sub("", out_str_link)  

            # strip Thai and Arabic digits
            pattern  = re.compile(r"([๑-๙(_)0-9]{1,})")
            out_str_ = pattern.sub("", out_str_number)

            out_STR += out_str_

            # collect hashtags
            pattern  = re.compile(r"(?P<out_list>#+[a-zA-Z0-9(_)|ก-๙(_)0-9]{1,})")
            output_list += re.findall(pattern, datas)

            # collect @mentions
            pattern  = re.compile(r"(?P<out_list>@+[a-zA-Z0-9(_)|ก-๙(_)0-9]{1,})")
            output_list += re.findall(pattern, datas)

            # collect letter+digit tokens
            pattern  = re.compile(r"(?P<out_list>[A-Za-z-_]+[\d]+[\w]*|[\d]+[A-Za-z-_]+[\w]*)")
            output_list += re.findall(pattern, datas)

        except AttributeError:
            pass  # datas was not a string; continue with whatever was collected
        # -------------------------- End of filtering --------------------------------
        #proc = deepcut.tokenize(out_STR)
        proc = word_tokenize(out_STR, engine="newmm", keep_whitespace=False, custom_dict=self.custom_tokenizer)
        for i in proc:
            # ---------------- special symbols ------------------
            special = re.compile(r"\W+").sub("", i)  # token with all non-word characters removed
            if special == "" or i.lower() in ("https", "http"):
                continue
            # ------------------------------------------------

            # -------------- Thai and English stop words --------------
            if(isthai(i)):
                if( i not in self.stopwords_thai ):
                    output_list.append(i)
            elif(i.isascii()):
                if( i.lower() not in self.STOP_WORD_1 and i.lower() not in self.STOP_WORD_2 and i.lower() not in self.STOP_WORD_3 ):
                    output_list.append(i)
            # --------------------------------------------------------
        return output_list
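A hedged usage sketch (the instance setup is assumed: custom_tokenizer, stopwords_thai, and the STOP_WORD_* lists come from the surrounding class, which is not shown):

text = "วันนี้อากาศดีมาก #เชียงใหม่ @friend https://example.com"
tokens = cleaner.main_nlp(text)  # `cleaner` is an instance of the class above
print(tokens)  # hashtags and mentions first, then filtered Thai/English words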
Example #10
def isThaiWord(word):
    # Thin wrapper around PyThaiNLP's isthai, presumably kept for a legacy camelCase API
    return isthai(word)
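A quick check (behavior is inherited directly from pythainlp.util.isthai):

print(isThaiWord("ไทย"))   # True
print(isThaiWord("thai"))  # False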