def __init__(
    self,
    custom_dict: List[Tuple[str, int]] = None,
    min_freq: int = 2,
    min_len: int = 2,
    max_len: int = 40,
    dict_filter: Callable[[str], bool] = _is_thai_and_not_num,
):
    """
    Build the word-frequency table for Peter Norvig's spell checker.

    :param custom_dict: (word, frequency) tuples used to build the
        spelling dictionary. When it is None or empty, the frequencies
        returned by ``tnc.word_freqs()`` (Thai National Corpus) are
        used instead.
    :param int min_freq: minimum frequency of a word to keep
        (default = 2)
    :param int min_len: minimum length (in characters) of a word to
        keep (default = 2)
    :param int max_len: maximum length (in characters) of a word to
        keep (default = 40)
    :param dict_filter: predicate used to filter the dictionary. The
        default removes any word with numbers or non-Thai characters.
        Pass None to disable filtering (``_no_filter`` is substituted).
    """
    # Fall back to the Thai National Corpus when no dictionary is given.
    source = custom_dict if custom_dict else tnc.word_freqs()
    accept = dict_filter if dict_filter else _no_filter

    # Keep only the entries that satisfy the frequency/length/filter rules.
    kept = []
    for entry in source:
        if _keep(entry, min_freq, min_len, max_len, accept):
            kept.append(entry)

    self.__WORDS = Counter(dict(kept))

    # The stored total is never negative: anything below 1 becomes 0.
    total = sum(self.__WORDS.values())
    self.__WORDS_TOTAL = 0 if total < 1 else total
def __init__(
    self,
    custom_dict: Union[Dict[str, int], Iterable[str], Iterable[Tuple[str, int]]] = None,
    min_freq: int = 2,
    min_len: int = 2,
    max_len: int = 40,
    dict_filter: Optional[Callable[[str], bool]] = _is_thai_and_not_num,
):
    """
    Initialize Peter Norvig's spell checker with a spelling dictionary.

    The spelling dictionary can be customized. By default it comes from
    the `Thai National Corpus <http://www.arts.chula.ac.th/ling/tnc/>`_

    Norvig's spell checker chooses the most likely spelling correction
    given a word: it searches for candidate words based on edit
    distance, then selects the candidate with the highest word
    occurrence probability.

    :param custom_dict: A custom spelling dictionary. This can be:
        (1) a dictionary (`dict`), with words (`str`) as keys
        and frequencies (`int`) as values;
        (2) an iterable (list, tuple, or set) of word (`str`)
        and frequency (`int`) tuples: `(str, int)`; or
        (3) an iterable of just words (`str`), without frequencies --
        in this case `1` will be assigned to every word.
        When None or empty, ``tnc.word_freqs()`` (Thai National Corpus)
        is used.
    :param int min_freq: Minimum frequency of a word to keep
        (default = 2)
    :param int min_len: Minimum length (in characters) of a word to
        keep (default = 2)
    :param int max_len: Maximum length (in characters) of a word to
        keep (default = 40)
    :param func dict_filter: A function to filter the dictionary.
        The default filter removes any word with numbers or non-Thai
        characters. If no filter is required, use None.
    """
    # Default to the Thai National Corpus and to a pass-through filter.
    word_source = custom_dict or tnc.word_freqs()
    word_filter = dict_filter or _no_filter

    converted = _convert_custom_dict(
        word_source, min_freq, min_len, max_len, word_filter
    )

    counts = Counter(dict(converted))
    # In-place addition of an empty Counter drops zero and negative counts.
    counts += Counter()

    self.__WORDS = counts
    self.__WORDS_TOTAL = sum(counts.values())
def __init__(
    self,
    custom_dict: List[Tuple[str, int]] = None,
    min_freq: int = 2,
    min_len: int = 2,
    max_len: int = 40,
    dict_filter: Optional[Callable[[str], bool]] = _is_thai_and_not_num,
):
    """
    Initialize Peter Norvig's spell checker with a spelling dictionary.

    The spelling dictionary can be customized. By default it comes from
    the `Thai National Corpus <http://www.arts.chula.ac.th/ling/tnc/>`_

    Norvig's spell checker chooses the most likely spelling correction
    given a word: it searches for candidate words based on edit
    distance, then selects the candidate with the highest word
    occurrence probability.

    :param custom_dict: A list of (word, frequency) tuples used to
        create the spelling dictionary. When None or empty,
        ``tnc.word_freqs()`` (Thai National Corpus) is used.
    :param int min_freq: Minimum frequency of a word to keep
        (default = 2)
    :param int min_len: Minimum length (in characters) of a word to
        keep (default = 2)
    :param int max_len: Maximum length (in characters) of a word to
        keep (default = 40)
    :param func dict_filter: A function to filter the dictionary.
        The default filter removes any word with numbers or non-Thai
        characters. If no filter is required, use None.
    """
    if not custom_dict:
        # No dictionary supplied: use the Thai National Corpus.
        custom_dict = tnc.word_freqs()

    if not dict_filter:
        dict_filter = _no_filter

    def _accepted(word_freq):
        # Apply the frequency/length/filter rules to one (word, freq) entry.
        return _keep(word_freq, min_freq, min_len, max_len, dict_filter)

    self.__WORDS = Counter(dict(filter(_accepted, custom_dict)))

    total = sum(self.__WORDS.values())
    if total < 1:
        # Never store a negative total.
        total = 0
    self.__WORDS_TOTAL = total
def Tokenize_word(self, text):
    """
    Segment a sentence, drop stopwords and spell-correct the remaining words.

    The first element of ``text`` is stripped of apostrophes and segmented
    with ``word_tokenize`` (``deepcut`` engine). Thai and English stopwords
    are removed, then every remaining token is handled by category:

    * tokens for which ``isthai`` is true are corrected with
      ``NorvigSpellChecker`` (built from ``tnc.word_freqs()``);
    * other purely alphabetic tokens are corrected with ``SpellChecker``;
    * other tokens containing at least one non-digit character are kept
      unchanged ("mixed" tokens);
    * the rest (digit-only tokens) are discarded.

    Finally a fixed set of punctuation marks is replaced and empty results
    are removed.

    :param text: sequence whose first element is the sentence to process;
        any further elements are ignored.
    :return: list of non-empty cleaned words, ordered as all Thai words,
        then all English words, then all mixed tokens.
    :rtype: list
    """
    # An empty input has no sentence to segment.
    if len(text) == 0:
        return []

    sentence = text[0].replace("'", "")
    tokens = word_tokenize(sentence, engine='deepcut')  # use this method

    # Stopword collections are loop-invariant: build them once as sets so
    # each membership test is O(1).
    thai_stopwords = frozenset(common.thai_stopwords())
    english_stopwords = frozenset(stopwords.words('english'))
    candidates = [
        token
        for token in tokens
        if token
        and token not in thai_stopwords
        and token not in english_stopwords
    ]

    # Spell checkers are expensive to construct, so create each one lazily
    # and reuse it for every token instead of rebuilding it per token.
    thai_checker = None
    english_checker = None

    thai_words = []
    english_words = []
    mixed_words = []

    for token in candidates:
        if isthai(token):
            if thai_checker is None:
                thai_checker = NorvigSpellChecker(custom_dict=tnc.word_freqs())
            thai_words.append(thai_checker.correct(token))
        elif token.isalpha():
            if english_checker is None:
                english_checker = SpellChecker()
            corrected = english_checker.correction(token)
            # NOTE(review): some spell checkers return None when they have
            # no suggestion (presumably the case for this SpellChecker --
            # confirm); keep the original token so the cleanup below does
            # not fail on None.
            english_words.append(token if corrected is None else corrected)
        elif re.search(r'\D', token):
            # Contains at least one non-digit character: keep as-is.
            mixed_words.append(token)
        # Digit-only tokens are intentionally dropped.

    def _clean(word):
        # Remove the "น." abbreviation, blank out separators, trim spaces.
        word = word.replace('น.', '')
        for separator in (':', '=', '–', "(", ")", "/"):
            word = word.replace(separator, ' ')
        return word.strip(" ")

    cleaned_words = (
        _clean(word) for word in thai_words + english_words + mixed_words
    )
    return [word for word in cleaned_words if word]
def test_tnc(self):
    # Every Thai National Corpus frequency loader must return something.
    loader_names = (
        "word_freqs",
        "unigram_word_freqs",
        "bigram_word_freqs",
        "trigram_word_freqs",
    )
    for loader_name in loader_names:
        loader = getattr(tnc, loader_name)
        self.assertIsNotNone(loader())
def test_tnc(self):
    # The Thai National Corpus word-frequency loader must return something.
    frequencies = tnc.word_freqs()
    self.assertIsNotNone(frequencies)
def test_tnc(self):
    # The full word-frequency list must be available.
    all_frequencies = tnc.word_freqs()
    self.assertIsNotNone(all_frequencies)

    # A single-word lookup must also return something.
    single_frequency = tnc.word_freq("นก")
    self.assertIsNotNone(single_frequency)