def correct(self, text, include_symbol=True):
    """Return the most probable spelling correction for ``text``.

    :param text: input query
    :param include_symbol: True, default
    :return: corrected_text, details [(wrong_word, right_word, begin_idx, end_idx), ...]
        example: cann you speling it?
        [['cann', 'can'], ['speling', 'spelling']]
    """
    self.check_init()
    corrected_parts = []
    details = []
    blocks = split_2_short_text(text, include_symbol=include_symbol)
    for token, start in blocks:
        # Only English tokens longer than one character are correction candidates.
        if len(token) > 1 and is_alphabet_string(token):
            # The custom confusion set takes precedence over the word corrector.
            if token in self.custom_confusion:
                fixed = self.custom_confusion[token]
            else:
                fixed = self.correct_word(token)
            if fixed != token:
                details.append((token, fixed, start, start + len(token)))
                token = fixed
        corrected_parts.append(token)
    text_new = ''.join(corrected_parts)
    # Sort detail tuples by their begin index.
    details.sort(key=operator.itemgetter(2))
    return text_new, details
def is_filter_token(token):
    """Return True if ``token`` should be skipped by error detection.

    A token is filtered when it is blank, purely numeric, purely
    alphabetic (after lowercasing), or not a Chinese string.
    """
    # Evaluate every predicate, mirroring the original flag-accumulation flow.
    checks = (
        not token.strip(),                   # blank
        token.isdigit(),                     # numeric
        is_alphabet_string(token.lower()),   # alphabetic
        not is_chinese_string(token),        # not Chinese
    )
    return any(checks)
def is_filter_token(token):
    """Return True if ``token`` should be skipped by error detection.

    A token is filtered when it is blank, a punctuation mark, purely
    numeric, or purely alphabetic (after lowercasing).
    """
    # Evaluate every predicate, mirroring the original flag-accumulation flow.
    checks = (
        not token.strip(),                   # blank
        token in PUNCTUATION_LIST,           # punctuation
        token.isdigit(),                     # numeric
        is_alphabet_string(token.lower()),   # alphabetic
    )
    return any(checks)
def detect(self, sentence):
    """Detect suspected errors in a sentence.

    Collects candidates from three detectors: the custom confusion set,
    out-of-vocabulary words, and language-model (ngram) char scores.

    :param sentence: input text to check
    :return: list of [error_word, begin_pos, end_pos, error_type],
             sorted by begin_pos
    """
    maybe_errors = []
    if not sentence.strip():
        return maybe_errors
    self.check_detector_initialized()
    # Text normalization (see uniform() for the exact transformations).
    sentence = uniform(sentence)
    # Tokenization
    tokens = self.tokenizer.tokenize(sentence)
    # print(tokens)
    # Add hits from the custom confusion set as suspected errors.
    for confuse in self.custom_confusion:
        idx = sentence.find(confuse)
        if idx > -1:
            maybe_err = [
                confuse, idx, idx + len(confuse), error_type["confusion"]
            ]
            self._add_maybe_error_item(maybe_err, maybe_errors)
    if self.is_word_error_detect:
        # Treat out-of-vocabulary words as suspected errors.
        for word, begin_idx, end_idx in tokens:
            # pass blank
            if not word.strip():
                continue
            # pass punctuation
            if word in PUNCTUATION_LIST:
                continue
            # pass num
            if word.isdigit():
                continue
            # pass alpha
            if is_alphabet_string(word.lower()):
                continue
            # pass words already in the frequency dict
            if word in self.word_freq:
                continue
            maybe_err = [word, begin_idx, end_idx, error_type["word"]]
            self._add_maybe_error_item(maybe_err, maybe_errors)
    if self.is_char_error_detect:
        # Use the language model to find suspected wrong characters.
        ngram_avg_scores = []
        try:
            for n in [2, 3]:
                scores = []
                for i in range(len(sentence) - n + 1):
                    word = sentence[i:i + n]
                    score = self.ngram_score(list(word))
                    scores.append(score)
                if not scores:
                    continue
                # Pad both ends so the sliding window covers every char
                # position of the sentence.
                for _ in range(n - 1):
                    scores.insert(0, scores[0])
                    scores.append(scores[-1])
                avg_scores = [
                    sum(scores[i:i + n]) / len(scores[i:i + n])
                    for i in range(len(sentence))
                ]
                ngram_avg_scores.append(avg_scores)
            # Average the per-position scores across the concatenated ngrams.
            sent_scores = list(
                np.average(np.array(ngram_avg_scores), axis=0))
            # Positions flagged as outliers are suspected wrong chars.
            for i in self._get_maybe_error_index(sent_scores):
                maybe_err = [sentence[i], i, i + 1, error_type["char"]]
                self._add_maybe_error_item(maybe_err, maybe_errors)
        except IndexError as ie:
            # Logger.warn is deprecated; warning() is the supported API.
            logger.warning("index error, sentence:" + sentence + str(ie))
        except Exception as e:
            logger.warning("detect error, sentence:" + sentence + str(e))
    return sorted(maybe_errors, key=lambda k: k[1], reverse=False)