Example #1
0
 def correct(self, text, include_symbol=True):
     """
     Return the most probable spelling correction for ``text``.
     :param text: input query string
     :param include_symbol: passed through to the splitter; defaults to True
     :return: (corrected_text, details) where details is a list of
         (wrong_word, right_word, begin_idx, end_idx) tuples
     example:
     cann you speling it? [['cann', 'can'], ['speling', 'spelling']]
     """
     self.check_init()
     corrected_parts = []
     fixes = []
     for token, start in split_2_short_text(text, include_symbol=include_symbol):
         # Only multi-character pure-alphabet tokens are correction candidates.
         if len(token) > 1 and is_alphabet_string(token):
             # Custom confusion entries take precedence over model correction.
             if token in self.custom_confusion:
                 replacement = self.custom_confusion[token]
             else:
                 replacement = self.correct_word(token)
             if replacement != token:
                 fixes.append((token, replacement, start, start + len(token)))
                 token = replacement
         corrected_parts.append(token)
     # Order the correction details by their begin index.
     fixes.sort(key=operator.itemgetter(2))
     return ''.join(corrected_parts), fixes
Example #2
0
 def is_filter_token(token):
     """Return True if ``token`` should be skipped by error detection."""
     # skip blank / whitespace-only tokens
     if not token.strip():
         return True
     # skip pure digits
     if token.isdigit():
         return True
     # skip pure alphabet strings
     if is_alphabet_string(token.lower()):
         return True
     # skip anything that is not a Chinese string
     if not is_chinese_string(token):
         return True
     return False
Example #3
0
 def is_filter_token(token):
     """Return True if ``token`` should be skipped by error detection."""
     # Skip tokens that are blank, punctuation, numeric, or pure alphabet.
     return (not token.strip()
             or token in PUNCTUATION_LIST
             or token.isdigit()
             or is_alphabet_string(token.lower()))
Example #4
0
    def detect(self, sentence):
        """
        Detect suspected errors in a sentence, as [word, position, error type].
        :param sentence: input text
        :return: list of [error_word, begin_pos, end_pos, error_type],
            sorted by begin_pos
        """
        maybe_errors = []
        # Empty / whitespace-only input yields no errors.
        if not sentence.strip():
            return maybe_errors
        self.check_detector_initialized()
        # Text normalization (see `uniform`).
        sentence = uniform(sentence)
        # Tokenize into (word, begin_idx, end_idx) triples.
        tokens = self.tokenizer.tokenize(sentence)
        # Add custom confusion-set hits as suspected errors.
        for confuse in self.custom_confusion:
            idx = sentence.find(confuse)
            if idx > -1:
                maybe_err = [
                    confuse, idx, idx + len(confuse), error_type["confusion"]
                ]
                self._add_maybe_error_item(maybe_err, maybe_errors)

        if self.is_word_error_detect:
            # Out-of-vocabulary words are suspected errors.
            for word, begin_idx, end_idx in tokens:
                # skip blank tokens
                if not word.strip():
                    continue
                # skip punctuation
                if word in PUNCTUATION_LIST:
                    continue
                # skip numbers
                if word.isdigit():
                    continue
                # skip pure-alphabet words
                if is_alphabet_string(word.lower()):
                    continue
                # skip in-vocabulary words
                if word in self.word_freq:
                    continue
                maybe_err = [word, begin_idx, end_idx, error_type["word"]]
                self._add_maybe_error_item(maybe_err, maybe_errors)

        if self.is_char_error_detect:
            # Use the n-gram language model to flag suspicious characters.
            ngram_avg_scores = []
            try:
                for n in [2, 3]:
                    scores = []
                    for i in range(len(sentence) - n + 1):
                        word = sentence[i:i + n]
                        score = self.ngram_score(list(word))
                        scores.append(score)
                    if not scores:
                        continue
                    # Pad both ends so every character position gets a
                    # full-width averaging window below.
                    for _ in range(n - 1):
                        scores.insert(0, scores[0])
                        scores.append(scores[-1])
                    avg_scores = [
                        sum(scores[i:i + n]) / len(scores[i:i + n])
                        for i in range(len(sentence))
                    ]
                    ngram_avg_scores.append(avg_scores)

                # Average the per-character scores across n-gram orders.
                sent_scores = list(
                    np.average(np.array(ngram_avg_scores), axis=0))
                # Low-scoring positions are suspected erroneous characters.
                for i in self._get_maybe_error_index(sent_scores):
                    maybe_err = [sentence[i], i, i + 1, error_type["char"]]
                    self._add_maybe_error_item(maybe_err, maybe_errors)
            except IndexError as ie:
                # logger.warn is deprecated; logger.warning is the supported
                # API. Lazy %s args keep the message output identical.
                logger.warning("index error, sentence:%s%s", sentence, ie)
            except Exception as e:
                logger.warning("detect error, sentence:%s%s", sentence, e)
        return sorted(maybe_errors, key=lambda k: k[1], reverse=False)