Beispiel #1
0
    def __init__(
        self,
        custom_dict: List[Tuple[str, int]] = None,
        min_freq: int = 2,
        min_len: int = 2,
        max_len: int = 40,
        dict_filter: Callable[[str], bool] = _is_thai_and_not_num,
    ):
        """
        Build the word-frequency table used by Peter Norvig's spell checker.

        :param list custom_dict: (word, frequency) tuples that make up the
                                 spelling dictionary. When empty or None,
                                 the Thai National Corpus word frequencies
                                 (around 40,000 words) are used instead.
        :param int min_freq: Minimum frequency of a word to keep (default = 2)
        :param int min_len: Minimum length (in characters) of a word to keep
                            (default = 2)
        :param int max_len: Maximum length (in characters) of a word to keep
                            (default = 40)
        :param func dict_filter: Predicate used to filter the dictionary.
                                 The default removes any word with number or
                                 non-Thai characters. Pass None to disable
                                 filtering.
        """
        # No dictionary supplied: fall back to the Thai National Corpus.
        source = custom_dict if custom_dict else tnc.word_freqs()

        # A falsy filter is replaced by the pass-through filter.
        accept = dict_filter if dict_filter else _no_filter

        # Retain only the entries approved by _keep().
        kept = []
        for entry in source:
            if _keep(entry, min_freq, min_len, max_len, accept):
                kept.append(entry)

        self.__WORDS = Counter(dict(kept))

        # A total below 1 is stored as 0.
        total = sum(self.__WORDS.values())
        self.__WORDS_TOTAL = 0 if total < 1 else total
Beispiel #2
0
    def __init__(
        self,
        custom_dict: List[Tuple[str, int]] = None,
        min_freq: int = 2,
        min_len: int = 2,
        max_len: int = 40,
        dict_filter: Callable[[str], bool] = _is_thai_and_not_num,
    ):
        """
        Set up the spelling dictionary for Peter Norvig's spell checker.

        :param list custom_dict: A list of (word, frequency) tuples to create
                                 a spelling dictionary. Defaults to the Thai
                                 National Corpus (around 40,000 words) when
                                 empty or None.
        :param int min_freq: Minimum frequency of a word to keep (default = 2)
        :param int min_len: Minimum length (in characters) of a word to keep
                            (default = 2)
        :param int max_len: Maximum length (in characters) of a word to keep
                            (default = 40)
        :param func dict_filter: A function to filter the dictionary. The
                                 default filter removes any word with number
                                 or non-Thai characters. If no filter is
                                 required, use None.
        """
        if not custom_dict:
            # Default dictionary: Thai National Corpus word frequencies.
            custom_dict = tnc.word_freqs()

        # None (or any falsy value) selects the pass-through filter.
        word_filter = dict_filter or _no_filter

        def _passes(entry):
            # True when the (word, frequency) entry should stay in the table.
            return _keep(entry, min_freq, min_len, max_len, word_filter)

        self.__WORDS = Counter(dict(filter(_passes, custom_dict)))

        total = sum(self.__WORDS.values())
        if total < 1:
            # Totals below 1 are recorded as 0.
            total = 0
        self.__WORDS_TOTAL = total
Beispiel #3
0
    def __init__(
        self,
        custom_dict: Optional[Union[Dict[str, int], Iterable[str],
                                    Iterable[Tuple[str, int]]]] = None,
        min_freq: int = 2,
        min_len: int = 2,
        max_len: int = 40,
        dict_filter: Optional[Callable[[str], bool]] = _is_thai_and_not_num,
    ):
        """
        Initializes Peter Norvig's spell checker object.
        Spelling dictionary can be customized.
        By default, spelling dictionary is from
        `Thai National Corpus <http://www.arts.chula.ac.th/ling/tnc/>`_

        Basically, Norvig's spell checker will choose the most likely
        spelling correction given a word by searching for candidate
        corrected words based on edit distance.
        Then, it selects the candidate with
        the highest word occurrence probability.

        :param custom_dict: A custom spelling dictionary. This can be:
                            (1) a dictionary (`dict`), with words (`str`)
                                as keys and frequencies (`int`) as values;
                            (2) an iterable (list, tuple, or set) of word
                                (`str`) and frequency (`int`) tuples:
                                `(str, int)`; or
                            (3) an iterable of just words (`str`), without
                                frequencies -- in this case `1` will be
                                assigned to every word.
                            Default (None or empty) is from Thai National
                            Corpus (around 40,000 words).
        :param int min_freq: Minimum frequency of a word to keep (default = 2)
        :param int min_len: Minimum length (in characters) of a word to keep
                            (default = 2)
        :param int max_len: Maximum length (in characters) of a word to keep
                            (default = 40)
        :param func dict_filter: A function to filter the dictionary.
                                 Default filter removes any word
                                 with number or non-Thai characters.
                                 If no filter is required, use None.
        """
        if not custom_dict:  # default, use Thai National Corpus
            custom_dict = tnc.word_freqs()

        if not dict_filter:  # None means "keep every word"
            dict_filter = _no_filter

        # Normalize the accepted input shapes and apply the
        # frequency/length/filter constraints in one place.
        custom_dict = _convert_custom_dict(custom_dict, min_freq, min_len,
                                           max_len, dict_filter)

        self.__WORDS = Counter(dict(custom_dict))
        self.__WORDS += Counter()  # remove zero and negative counts
        self.__WORDS_TOTAL = sum(self.__WORDS.values())
Beispiel #4
0
    def __init__(
        self,
        custom_dict: Optional[List[Tuple[str, int]]] = None,
        min_freq: int = 2,
        min_len: int = 2,
        max_len: int = 40,
        dict_filter: Optional[Callable[[str], bool]] = _is_thai_and_not_num,
    ):
        """
        Initializes Peter Norvig's spell checker object.
        Spelling dictionary can be customized.
        By default, spelling dictionary is from
        `Thai National Corpus <http://www.arts.chula.ac.th/ling/tnc/>`_

        Basically, Norvig's spell checker will choose the most likely
        spelling correction given a word by searching for candidate
        corrected words based on edit distance.
        Then, it selects the candidate with
        the highest word occurrence probability.

        :param list custom_dict: A list of tuple (word, frequency) to create
                                 a spelling dictionary. Default (None or
                                 empty) is from Thai National Corpus
                                 (around 40,000 words).
        :param int min_freq: Minimum frequency of a word to keep (default = 2)
        :param int min_len: Minimum length (in characters) of a word to keep
                            (default = 2)
        :param int max_len: Maximum length (in characters) of a word to keep
                            (default = 40)
        :param func dict_filter: A function to filter the dictionary.
                                 Default filter removes any word
                                 with number or non-Thai characters.
                                 If no filter is required, use None.
        """
        if not custom_dict:  # default, use Thai National Corpus
            custom_dict = tnc.word_freqs()

        if not dict_filter:  # None means "keep every word"
            dict_filter = _no_filter

        # Keep only the entries accepted by _keep() for the given
        # frequency/length limits and filter function.
        custom_dict = [
            word_freq for word_freq in custom_dict
            if _keep(word_freq, min_freq, min_len, max_len, dict_filter)
        ]

        self.__WORDS = Counter(dict(custom_dict))
        self.__WORDS_TOTAL = sum(self.__WORDS.values())
        if self.__WORDS_TOTAL < 1:
            # Totals below 1 are stored as 0.
            self.__WORDS_TOTAL = 0
Beispiel #5
0
 def Tokenize_word(self,text):
     """
     Segment text into words, drop stopwords and spell-correct the rest.

     The first element of ``text`` is tokenized with ``word_tokenize``
     (``deepcut`` engine) after apostrophes are removed. Tokens found in
     the Thai or English stopword collections are discarded, then every
     remaining token is handled according to its kind:

     * tokens for which ``isthai`` is true are corrected with
       ``NorvigSpellChecker``;
     * other tokens that are purely alphabetic are corrected with
       ``SpellChecker``;
     * other tokens containing at least one non-digit character are kept
       unchanged;
     * tokens made only of digits are dropped.

     Finally ``น.`` is removed, the characters ``: = – ( ) /`` are replaced
     by spaces, surrounding spaces are stripped and empty results are
     discarded.

     :param text: indexable object whose first item is the string to process
     :return: cleaned words; Thai words first, then corrected alphabetic
              words, then the unchanged mixed tokens
     :rtype: list
     """
     sentence = text[0].replace("'", "")
     tokens = word_tokenize(sentence, engine='deepcut')

     # Fetch each stopword collection once (not once per token) and use
     # sets so every membership test is O(1).
     thai_stops = frozenset(common.thai_stopwords())
     english_stops = frozenset(stopwords.words('english'))
     candidates = [
         tok for tok in tokens
         if tok and tok not in thai_stops and tok not in english_stops
     ]

     thai_words = []
     english_words = []
     mixed_words = []
     # Spell checkers are costly to build, so each one is created at most
     # once per call, and only if a token actually needs it.
     thai_checker = None
     english_checker = None
     for tok in candidates:
         if isthai(tok):
             if thai_checker is None:
                 thai_checker = NorvigSpellChecker(
                     custom_dict=tnc.word_freqs())
             thai_words.append(thai_checker.correct(tok))
         elif tok.isalpha():
             if english_checker is None:
                 english_checker = SpellChecker()
             # NOTE(review): if SpellChecker is pyspellchecker, newer
             # releases return None when no correction is found; fall back
             # to the token itself so the cleanup below never sees None.
             english_words.append(english_checker.correction(tok) or tok)
         elif re.search(r'\D', tok):
             # Contains at least one non-digit character: keep as is.
             mixed_words.append(tok)
         # Remaining case: digits only, intentionally dropped.

     # One translate() pass replaces the chain of single-character
     # replace() calls.
     separators = str.maketrans({ch: ' ' for ch in ':=–()/'})
     cleaned = (
         word.replace('น.', '').translate(separators).strip(' ')
         for word in thai_words + english_words + mixed_words
     )
     return [word for word in cleaned if word]
 
     
     ######## Eng word segment ########
     '''word = text[0]
Beispiel #6
0
 def test_tnc(self):
     # Every listed tnc frequency loader must return something other
     # than None; loaders are looked up and called one after another.
     for loader_name in (
         "word_freqs",
         "unigram_word_freqs",
         "bigram_word_freqs",
         "trigram_word_freqs",
     ):
         loader = getattr(tnc, loader_name)
         self.assertIsNotNone(loader())
 def test_tnc(self):
     # tnc.word_freqs() must return something other than None.
     freqs = tnc.word_freqs()
     self.assertIsNotNone(freqs)
Beispiel #8
0
 def test_tnc(self):
     # word_freqs() and the single-word lookup word_freq("นก") must each
     # return something other than None.
     all_freqs = tnc.word_freqs()
     self.assertIsNotNone(all_freqs)
     single_freq = tnc.word_freq("นก")
     self.assertIsNotNone(single_freq)