Example 1
def _correct_item(sentence, idx, item):
    """
    纠正错误,逐词处理
    :param sentence:
    :param idx:
    :param item:
    :return: corrected word 修正的词语
    """
    corrected_sent = sentence
    if not is_chinese_string(item):
        return corrected_sent, []
    # get all plausible correct candidates for the word
    maybe_error_items = _generate_items(item)
    if not maybe_error_items:
        return corrected_sent, []
    ids = idx.split(',')
    begin_id = int(ids[0])
    end_id = int(ids[-1]) if len(ids) > 1 else int(ids[0]) + 1
    before = sentence[:begin_id]
    after = sentence[end_id:]
    corrected_item = min(maybe_error_items,
                         key=lambda k: get_ppl_score(list(before + k + after),
                                                     mode=trigram_char))
    wrongs, rights, begin_idx, end_idx = [], [], [], []
    if corrected_item != item:
        corrected_sent = before + corrected_item + after
        # default_logger.debug('pred:', item, '=>', corrected_item)
        wrongs.append(item)
        rights.append(corrected_item)
        begin_idx.append(begin_id)
        end_idx.append(end_id)
    detail = list(zip(wrongs, rights, begin_idx, end_idx))
    return corrected_sent, detail
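The heart of this snippet is picking, among the candidates, the replacement that minimizes the language-model perplexity of the full sentence. Below is a self-contained sketch of just that selection step; the sentence, the span, the candidate list, and the toy `score` function standing in for `get_ppl_score` are all invented for illustration.

def score(chars):
    # Toy stand-in for get_ppl_score: counts mismatches against a
    # reference string, so lower means "more fluent".
    good = "机器学习很流行"
    return sum(1 for i, c in enumerate(chars) if i >= len(good) or c != good[i])

sentence = "机器学习很流形"
begin_id, end_id = 5, 7              # span of the suspected error "流形"
candidates = ["流形", "流行", "流向"]

before, after = sentence[:begin_id], sentence[end_id:]
corrected = min(candidates, key=lambda k: score(list(before + k + after)))
print(corrected)  # -> 流行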
Example 2
    def correct(self, sentence=''):
        """
        句子改错
        :param sentence: 句子文本
        :return: 改正后的句子, list(wrong, right, begin_idx, end_idx)
        """
        detail = []
        maybe_errors = self.detect(sentence)
        for item, begin_idx, end_idx, err_type in maybe_errors:
            # correct errors one by one
            before_sent = sentence[:begin_idx]
            after_sent = sentence[end_idx:]

            if err_type == ErrorType.char:
                # skip non-Chinese chars
                if not is_chinese_string(item):
                    continue
                if not self.check_vocab_has_all_token(sentence):
                    continue
                # predict the most likely correct char with the masked LM
                corrected_item = self.predict_mask_token(
                    sentence, begin_idx, end_idx)
            elif err_type == ErrorType.word:
                corrected_item = item
            else:
                print('unknown err_type')
                continue  # corrected_item would be undefined here
            # output
            if corrected_item != item:
                sentence = before_sent + corrected_item + after_sent
                detail_word = [item, corrected_item, begin_idx, end_idx]
                detail.append(detail_word)
        detail = sorted(detail, key=operator.itemgetter(2))
        return sentence, detail
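`ErrorType` above is presumably a small enum-like class distinguishing confusion-, char-, and word-level errors. A minimal hypothetical stand-in, just to make the dispatch concrete (the real class in the source project may differ):

from enum import Enum

class ErrorType(str, Enum):
    # Hypothetical values, for illustration only.
    confusion = 'confusion'
    char = 'char'
    word = 'word'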
Example 3
    def generate_items(self, word, fraction=1):
        """
        生成纠错候选集
        :param word:
        :param fraction:
        :return:
        """
        candidates_1_order = []
        candidates_2_order = []
        candidates_3_order = []
        # same pinyin word
        candidates_1_order.extend(self._confusion_word_set(word))
        # custom confusion word
        candidates_1_order.extend(self._confusion_custom_set(word))
        # same pinyin char
        if len(word) == 1:
            # same one char pinyin
            confusion = [i for i in self._confusion_char_set(word[0]) if i]
            candidates_2_order.extend(confusion)
        if len(word) == 2:
            # same first char pinyin
            confusion = [
                i + word[1:] for i in self._confusion_char_set(word[0]) if i
            ]
            candidates_2_order.extend(confusion)
            # same last char pinyin
            confusion = [
                word[:-1] + i for i in self._confusion_char_set(word[-1]) if i
            ]
            candidates_2_order.extend(confusion)
        if len(word) > 2:
            # same mid char pinyin
            confusion = [
                word[0] + i + word[2:]
                for i in self._confusion_char_set(word[1])
            ]
            candidates_3_order.extend(confusion)

            # same first word pinyin
            confusion_word = [
                i + word[-1] for i in self._confusion_word_set(word[:-1])
            ]
            candidates_3_order.extend(confusion_word)

            # same last word pinyin
            confusion_word = [
                word[0] + i for i in self._confusion_word_set(word[1:])
            ]
            candidates_3_order.extend(confusion_word)

        # add all confusion word list
        confusion_word_set = set(candidates_1_order + candidates_2_order +
                                 candidates_3_order)
        confusion_word_list = [
            item for item in confusion_word_set if is_chinese_string(item)
        ]
        confusion_sorted = sorted(confusion_word_list,
                                  key=lambda k: self.word_frequency(k),
                                  reverse=True)
        return confusion_sorted[:len(confusion_word_list) // fraction + 1]
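A toy trace of the two-char branch above, with invented char-level confusion sets, showing how per-char substitutions expand into word candidates:

# Invented confusion sets; the real ones come from pinyin/shape tables.
confusion_char = {"帐": {"账", "胀"}, "户": {"护", "沪"}}

word = "帐户"
candidates = set()
# same first char pinyin
candidates |= {c + word[1:] for c in confusion_char.get(word[0], set())}
# same last char pinyin
candidates |= {word[:-1] + c for c in confusion_char.get(word[-1], set())}
print(sorted(candidates))  # ['帐护', '帐沪', '胀户', '账户']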
Example 4
 def _correct_item(self, sentence, item, begin_idx, end_idx):
     """
     纠正字词错误
     :param sentence:
     :param idx:
     :param item:
     :return: corrected word 修正的词语
     """
     corrected_sent = sentence
     detail = []
     if not is_chinese_string(item):
         return corrected_sent, detail
      # get all plausible correct candidates
     maybe_right_items = self._generate_items(item)
     if not maybe_right_items:
         return corrected_sent, []
     before_sent = sentence[:begin_idx]
     after_sent = sentence[end_idx:]
     corrected_item = min(
         maybe_right_items,
         key=lambda k: self.ppl_score(list(before_sent + k + after_sent)))
     if corrected_item != item:
         corrected_sent = before_sent + corrected_item + after_sent
         # default_logger.debug('predict:' + item + '=>' + corrected_item)
         detail = [item, corrected_item, begin_idx, end_idx]
     return corrected_sent, detail
Example 5
def tokenize_words(text):
    """Word segmentation"""
    output = []
    sentences = split_2_short_text(text, include_symbol=True)
    for sentence, idx in sentences:
        if is_chinese_string(sentence):
            import jieba  # lazy import; only needed for Chinese text
            output.extend(jieba.lcut(sentence))
        else:
            output.extend(whitespace_tokenize(sentence))
    return output
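`jieba.lcut` is jieba's list-returning segmentation; the canonical example from the jieba README:

import jieba

print(jieba.lcut("我来到北京清华大学"))
# -> ['我', '来到', '北京', '清华大学'] (default accurate mode)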
Example 6
def _generate_items(word, fraction=1):
    candidates_1_order = []
    candidates_2_order = []
    candidates_3_order = []
    # same pinyin word
    candidates_1_order.extend(get_confusion_word_set(word))
    # same pinyin char
    if len(word) == 1:
        # same pinyin
        confusion = [i for i in get_confusion_char_set(word[0]) if i]
        candidates_2_order.extend(confusion)
    if len(word) > 1:
        # same first pinyin
        confusion = [
            i + word[1:] for i in get_confusion_char_set(word[0]) if i
        ]
        candidates_2_order.extend(confusion)
        # same last pinyin
        confusion = [
            word[:-1] + i for i in get_confusion_char_set(word[-1]) if i
        ]
        candidates_2_order.extend(confusion)
        if len(word) > 2:
            # same mid char pinyin
            confusion = [
                word[0] + i + word[2:] for i in get_confusion_char_set(word[1])
            ]
            candidates_3_order.extend(confusion)

            # same first word pinyin
            confusion_word = [
                i + word[-1] for i in get_confusion_word_set(word[:-1])
            ]
            candidates_1_order.extend(confusion_word)

            # same last word pinyin
            confusion_word = [
                word[0] + i for i in get_confusion_word_set(word[1:])
            ]
            candidates_1_order.extend(confusion_word)

    # add all confusion word list
    confusion_word_set = set(candidates_1_order + candidates_2_order +
                             candidates_3_order)
    confusion_word_list = [
        item for item in confusion_word_set if is_chinese_string(item)
    ]
    confusion_sorted = sorted(confusion_word_list,
                              key=lambda k: get_frequency(k),
                              reverse=True)
    return confusion_sorted[:len(confusion_word_list) // fraction + 1]
Example 7
    def electra_correct(self, text):
        """
        句子纠错
        :param text: 句子文本
        :return: corrected_text, list[list], [error_word, correct_word, begin_pos, end_pos]
        """
        text_new = ''
        details = []
        # normalize encoding: utf-8 to unicode
        text = convert_to_unicode(text)
        # split long text into short sentences
        blocks = self.split_2_short_text(text, include_symbol=True)
        for blk, start_idx in blocks:
            error_ids = self.electra_detect(blk)
            sentence_lst = list(blk)
            for idx in error_ids:
                s = sentence_lst[idx]
                if is_chinese_string(s):
                    # handle a Chinese char error
                    sentence_lst[idx] = self.mask
                    sentence_new = ''.join(sentence_lst)
                    # fill-mask prediction for [MASK]; top-5 by default
                    predicts = self.g_model(sentence_new)
                    top_tokens = []
                    for p in predicts:
                        token_id = p.get('token', 0)
                        token_str = self.g_model.tokenizer.convert_ids_to_tokens(
                            token_id)
                        top_tokens.append(token_str)

                    if top_tokens and (s not in top_tokens):
                        # get all plausible correct candidates
                        candidates = self.generate_items(s)
                        if candidates:
                            for token_str in top_tokens:
                                if token_str in candidates:
                                    details.append([
                                        s, token_str, start_idx + idx,
                                        start_idx + idx + 1
                                    ])
                                    sentence_lst[idx] = token_str
                                    break
                    # restore the original char if no candidate was accepted
                    if sentence_lst[idx] == self.mask:
                        sentence_lst[idx] = s

            blk_new = ''.join(sentence_lst)
            text_new += blk_new
        details = sorted(details, key=operator.itemgetter(2))
        return text_new, details
Example 8
    def generate_items(self, word, fragment=1):
        """
        生成纠错候选集
        :param word:
        :param fragment: 分段
        :return:
        """
        self.check_corrector_initialized()
        # 1-char candidates
        candidates_1 = []
        # 2-char candidates
        candidates_2 = []
        # candidates longer than 2 chars
        candidates_3 = []

        # same pinyin word
        candidates_1.extend(self._confusion_word_set(word))
        # custom confusion word
        candidates_1.extend(self._confusion_custom_set(word))
        # same pinyin char
        if len(word) == 1:
            # same one char pinyin
            confusion = [i for i in self._confusion_char_set(word[0]) if i]
            candidates_1.extend(confusion)
        if len(word) == 2:
            # same first char pinyin
            confusion = [i + word[1:] for i in self._confusion_char_set(word[0]) if i]
            candidates_2.extend(confusion)
            # same last char pinyin
            confusion = [word[:-1] + i for i in self._confusion_char_set(word[-1]) if i]
            candidates_2.extend(confusion)
        if len(word) > 2:
            # same mid char pinyin
            confusion = [word[0] + i + word[2:] for i in self._confusion_char_set(word[1])]
            candidates_3.extend(confusion)

            # same first word pinyin
            confusion_word = [i + word[-1] for i in self._confusion_word_set(word[:-1])]
            candidates_3.extend(confusion_word)

            # same last word pinyin
            confusion_word = [word[0] + i for i in self._confusion_word_set(word[1:])]
            candidates_3.extend(confusion_word)

        # add all confusion word list
        confusion_word_set = set(candidates_1 + candidates_2 + candidates_3)
        confusion_word_list = [item for item in confusion_word_set if is_chinese_string(item)]
        confusion_sorted = sorted(confusion_word_list, key=lambda k: self.word_frequency(k), reverse=True)
        return confusion_sorted[:len(confusion_word_list) // fragment + 1]
Example 9
 def is_filter_token(token):
     result = False
     # pass blank
     if not token.strip():
         result = True
     # pass num
     if token.isdigit():
         result = True
     # pass alpha
     if is_alphabet_string(token.lower()):
         result = True
     # pass not chinese
     if not is_chinese_string(token):
         result = True
     return result
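A self-contained approximation of this filter, with rough stand-ins for the pycorrector helpers (the real `is_chinese_string` / `is_alphabet_string` cover more Unicode ranges than these one-liners):

# Rough stand-ins, for illustration only.
def is_chinese_string(s):
    return all('\u4e00' <= ch <= '\u9fff' for ch in s)

def is_alphabet_string(s):
    return all('a' <= ch <= 'z' for ch in s)

def is_filter_token(token):
    return (not token.strip()                     # blank
            or token.isdigit()                    # number
            or is_alphabet_string(token.lower())  # pure letters
            or not is_chinese_string(token))      # anything non-Chinese

for t in [" ", "42", "abc", "中国", "中1"]:
    print(repr(t), is_filter_token(t))  # only '中国' survives (False)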
Example 10
    def bert_correct(self, text):
        """
        句子纠错
        :param text: 句子文本
        :return: corrected_text, list[list], [error_word, correct_word, begin_pos, end_pos]
        """
        text_new = ''
        details = []
        self.check_corrector_initialized()
        # normalize encoding: utf-8 to unicode
        text = convert_to_unicode(text)
        # split long text into short blocks
        blocks = self.split_text_by_maxlen(text, maxlen=128)
        for blk, start_idx in blocks:
            blk_new = ''
            for idx, s in enumerate(blk):
                # handle a Chinese char error
                if is_chinese_string(s):
                    sentence_lst = list(blk_new + blk[idx:])
                    sentence_lst[idx] = self.mask
                    sentence_new = ''.join(sentence_lst)
                    # predict with the masked LM; top-5 by default
                    predicts = self.model(sentence_new)
                    top_tokens = []
                    for p in predicts:
                        token_id = p.get('token', 0)
                        token_str = self.model.tokenizer.convert_ids_to_tokens(
                            token_id)
                        top_tokens.append(token_str)

                    if top_tokens and (s not in top_tokens):
                        # get all plausible correct candidates
                        candidates = self.generate_items(s)
                        if candidates:
                            for token_str in top_tokens:
                                if token_str in candidates:
                                    details.append([
                                        s, token_str, start_idx + idx,
                                        start_idx + idx + 1
                                    ])
                                    s = token_str
                                    break
                blk_new += s
            text_new += blk_new
        details = sorted(details, key=operator.itemgetter(2))
        return text_new, details
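The `self.model(sentence_new)` call behaves like a Hugging Face fill-mask pipeline. A minimal sketch of that one prediction step; the model name and sentence are illustrative, and the first call downloads the model:

from transformers import pipeline

# fill-mask returns the top-5 predictions by default, matching the
# "top-5" comment in the method above.
fill_mask = pipeline("fill-mask", model="bert-base-chinese")
predicts = fill_mask("今天天气很[MASK]。")
top_tokens = [p["token_str"] for p in predicts]
print(top_tokens)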
Example 11
 def correct(self, sentence):
     """
     句子改错
     :param sentence: 句子文本
     :return: 改正后的句子, list(wrong, right, begin_idx, end_idx)
     """
     detail = []
     self.check_corrector_initialized()
      # split long sentence into short ones
      # sentences = re.split(r";|,|。|\?\s|;\s|,\s", sentence)
     maybe_errors = self.detect(sentence)
      # trick: like a translation model, process in reverse; sort by end_idx, descending
     maybe_errors = sorted(maybe_errors,
                           key=operator.itemgetter(2),
                           reverse=True)
     pprint('ordered maybe errors', maybe_errors)
     print('---generate items:')
      # it really does fix errors one at a time...
     for item, begin_idx, end_idx, err_type in maybe_errors:
          # correct errors one by one
         before_sent = sentence[:begin_idx]
         after_sent = sentence[end_idx:]
         pprint('item,err_type', [item, err_type])
          # words in the custom confusion dict map directly to their correction
         if err_type == error_type["confusion"]:
             corrected_item = self.custom_confusion[item]
         else:
              # skip non-Chinese tokens
             if not is_chinese_string(item):
                 continue
              # get all plausible correct candidates
             maybe_right_items = self.generate_items(item)
             pprint('maybe_right_items', maybe_right_items)
             if not maybe_right_items:
                 continue
             corrected_item = self.lm_correct_item(item, maybe_right_items,
                                                   before_sent, after_sent)
             pprint('corrected_item', corrected_item)
         # output
         if corrected_item != item:
             sentence = before_sent + corrected_item + after_sent
             # logger.debug('predict:' + item + '=>' + corrected_item)
             detail_word = [item, corrected_item, begin_idx, end_idx]
             detail.append(detail_word)
     detail = sorted(detail, key=operator.itemgetter(2))
     return sentence, detail
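The reverse-order trick matters because a replacement can change the sentence length; applying edits right-to-left keeps the begin/end indices of the remaining edits valid. A tiny illustration with invented edits:

sentence = "abcdef"
edits = [(1, 3, "XYZ"), (4, 6, "Q")]  # (begin, end, replacement); lengths change

# Apply right-to-left (sorted by end index, descending) so the offsets
# of the not-yet-applied edits stay correct.
for begin, end, rep in sorted(edits, key=lambda e: e[1], reverse=True):
    sentence = sentence[:begin] + rep + sentence[end:]
print(sentence)  # -> aXYZdQ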
Example 12
    def ernie_correct(self, text, ernie_cut_type='char'):
        """
        句子纠错
        :param text: 句子文本
        :param ernie_cut_type: 切词类型(char/word)
        :return: corrected_text, list[list], [error_word, correct_word, begin_pos, end_pos]
        """
        text_new = ''
        details = []
        self.check_corrector_initialized()
        # normalize encoding: utf-8 to unicode
        text = convert_to_unicode(text)
        # split long text into short blocks
        blocks = self.split_text_by_maxlen(text, maxlen=512)
        for blk, start_idx in blocks:
            blk_new = ''
            blk = segment(blk, cut_type=ernie_cut_type, pos=False)
            for idx, s in enumerate(blk):
                # handle a Chinese char/word error
                if is_chinese_string(s):
                    sentence_lst = blk[:idx] + blk[idx:]  # copy of the token list
                    sentence_lst[idx] = self.mask_token * len(s)
                    sentence_new = ' '.join(sentence_lst)
                    # predict with the masked LM; top-5 by default
                    predicts = self.predict_mask(sentence_new)
                    top_tokens = []
                    for p in predicts:
                        top_tokens.append(p.get('token', ''))

                    if top_tokens and (s not in top_tokens):
                        # get all plausible correct candidates
                        candidates = self.generate_items(s)
                        if candidates:
                            for token_str in top_tokens:
                                if token_str in candidates:
                                    details.append([
                                        s, token_str, start_idx + idx,
                                        start_idx + idx + 1
                                    ])
                                    s = token_str
                                    break
                blk_new += s
            text_new += blk_new
        details = sorted(details, key=operator.itemgetter(2))
        return text_new, details
Example 13
    def correct(self, sentence):
        """
        句子改错
        :param sentence: 句子文本
        :return: 改正后的句子, list(wrong, right, begin_idx, end_idx)
        """
        detail = []
        self.check_corrector_initialized()
        maybe_errors = self.detect(sentence)
        # sort by begin index, ascending
        maybe_errors = sorted(maybe_errors,
                              key=operator.itemgetter(2),
                              reverse=False)
        for cur_item, begin_idx, end_idx, err_type in maybe_errors:
            # correct errors one by one
            before_sent = sentence[:begin_idx]
            after_sent = sentence[end_idx:]

            # words in the custom confusion dict map directly to their correction
            if err_type == ErrorType.confusion:
                corrected_item = self.custom_confusion[cur_item]
            else:
                # skip non-Chinese tokens
                if not is_chinese_string(cur_item):
                    continue
                # get all plausible correct candidates
                candidates = self.generate_items(cur_item)
                if not candidates:
                    continue
                corrected_item = self.get_lm_correct_item(cur_item,
                                                          candidates,
                                                          before_sent,
                                                          after_sent,
                                                          n=5,
                                                          threshold=50)
            # output
            if corrected_item != cur_item:
                sentence = before_sent + corrected_item + after_sent
                detail_word = [cur_item, corrected_item, begin_idx, end_idx]
                detail.append(detail_word)
        detail = sorted(detail, key=operator.itemgetter(2))
        return sentence, detail
Example 14
    def correct(self, sentence=''):
        """
        句子改错
        :param sentence: 句子文本
        :return: 改正后的句子, list(wrong, right, begin_idx, end_idx)
        """
        detail = []
        maybe_errors = self.detect(sentence)
        maybe_errors = sorted(maybe_errors,
                              key=operator.itemgetter(2),
                              reverse=False)
        for item, begin_idx, end_idx, err_type in maybe_errors:
            # correct errors one by one
            before_sent = sentence[:begin_idx]
            after_sent = sentence[end_idx:]

            # words in the custom confusion dict map directly to their correction
            if err_type == error_type["confusion"]:
                corrected_item = self.custom_confusion[item]
            elif err_type == error_type["char"]:
                # skip non-Chinese chars
                if not is_chinese_string(item):
                    continue
                if not self.check_vocab_has_all_token(sentence):
                    continue
                # predict the most likely correct char
                corrected_item = self.bert_lm_infer(sentence,
                                                    error_begin_idx=begin_idx,
                                                    error_end_idx=end_idx)
            elif err_type == error_type["word"]:
                corrected_item = item
            else:
                print('unknown err_type')
                continue  # corrected_item would be undefined here
            # output
            if corrected_item != item:
                sentence = before_sent + corrected_item + after_sent
                detail_word = [item, corrected_item, begin_idx, end_idx]
                detail.append(detail_word)
        detail = sorted(detail, key=operator.itemgetter(2))
        return sentence, detail
Example 15
    def _correct_item(self, sentence, item, begin_idx, end_idx, err_type):
        """
        纠正字词错误
        :param sentence:
        :param item:
        :param begin_idx:
        :param end_idx:
        :param err_type: 错误类型
        :return: corrected word 修正的词语
        """
        corrected_sent = sentence
        detail = []

        before_sent = sentence[:begin_idx]
        after_sent = sentence[end_idx:]

        # words in the custom confusion dict map directly to their correction
        if err_type == error_type["confusion"]:
            corrected_item = self.custom_confusion[item]
        else:
            # skip non-Chinese tokens
            if not is_chinese_string(item):
                return corrected_sent, detail
            # get all plausible correct candidates
            maybe_right_items = self._generate_items(item)
            if not maybe_right_items:
                return corrected_sent, detail
            # keep the original item as a fallback so replacement only
            # happens when a candidate actually scores better
            if item not in maybe_right_items:
                maybe_right_items.append(item)
            corrected_item = min(maybe_right_items,
                                 key=lambda k: self.ppl_score(
                                     list(before_sent + k + after_sent)))

        # output
        if corrected_item != item:
            corrected_sent = before_sent + corrected_item + after_sent
            # default_logger.debug('predict:' + item + '=>' + corrected_item)
            detail = [item, corrected_item, begin_idx, end_idx]
        return corrected_sent, detail
Example 16
    def correct(self, sentence):
        """
        句子改错
        :param sentence: 句子文本
        :return: 改正后的句子, list(wrong, right, begin_idx, end_idx)
        """
        detail = []
        self.check_corrector_initialized()
        maybe_errors = self.detect(sentence)
        # process in reverse order (by end_idx) so earlier indices stay valid
        maybe_errors = sorted(maybe_errors,
                              key=operator.itemgetter(2),
                              reverse=True)
        for item, begin_idx, end_idx, err_type in maybe_errors:
            # correct errors one by one
            before_sent = sentence[:begin_idx]
            after_sent = sentence[end_idx:]

            # words in the custom confusion dict map directly to their correction
            if err_type == ErrorType.confusion:
                corrected_item = self.custom_confusion[item]
            else:
                # skip non-Chinese tokens
                if not is_chinese_string(item):
                    continue
                # get all plausible correct candidates
                maybe_right_items = self.generate_items(item)
                if not maybe_right_items:
                    continue
                corrected_item = self.lm_correct_item(item, maybe_right_items,
                                                      before_sent, after_sent)
            # output
            if corrected_item != item:
                sentence = before_sent + corrected_item + after_sent
                # logger.debug('predict:' + item + '=>' + corrected_item)
                detail_word = [item, corrected_item, begin_idx, end_idx]
                detail.append(detail_word)
        detail = sorted(detail, key=operator.itemgetter(2))
        return sentence, detail
Example 17
    def predict(self, text, **kwargs):
        details = []
        text_new = ''
        self.check_corrector_initialized()
        # normalize encoding: utf-8 to unicode
        text = convert_to_unicode(text)
        # split long text into short blocks
        blocks = split_text_by_maxlen(text, maxlen=128)
        for blk, start_idx in blocks:
            blk_new = ''
            for idx, s in enumerate(blk):
                # handle a Chinese char error
                if is_chinese_string(s):
                    sentence_lst = list(blk_new + blk[idx:])
                    sentence_lst[idx] = self.mask
                    # predict with the masked LM; top-10 by default
                    predict_words = self.predict_mask_token(sentence_lst,
                                                            idx,
                                                            k=10)
                    top_tokens = []
                    for w, _ in predict_words:
                        top_tokens.append(w)

                    if top_tokens and (s not in top_tokens):
                        # get all plausible correct candidates
                        candidates = self.generate_items(s)
                        if candidates:
                            for token_str in top_tokens:
                                if token_str in candidates:
                                    details.append(
                                        (s, token_str, start_idx + idx,
                                         start_idx + idx + 1))
                                    s = token_str
                                    break
                blk_new += s
            text_new += blk_new
        details = sorted(details, key=operator.itemgetter(2))
        return text_new, details
Example 18
def _generate_items(word, fraction=1):
    candidates_1_order = []
    candidates_2_order = []
    candidates_3_order = []
    # same pinyin word
    candidates_1_order.extend(get_confusion_word_set(word))
    # same pinyin char
    if len(word) == 1:
        # same pinyin
        confusion = [i for i in get_confusion_char_set(word[0]) if i]
        candidates_2_order.extend(confusion)
    if len(word) > 1:
        # same first pinyin
        confusion = [i + word[1:] for i in get_confusion_char_set(word[0]) if i]
        candidates_2_order.extend(confusion)
        # same last pinyin
        confusion = [word[:-1] + i for i in get_confusion_char_set(word[-1]) if i]
        candidates_2_order.extend(confusion)
        if len(word) > 2:
            # same mid char pinyin
            confusion = [word[0] + i + word[2:] for i in get_confusion_char_set(word[1])]
            candidates_3_order.extend(confusion)

            # same first word pinyin
            confusion_word = [i + word[-1] for i in get_confusion_word_set(word[:-1])]
            candidates_1_order.extend(confusion_word)

            # same last word pinyin
            confusion_word = [word[0] + i for i in get_confusion_word_set(word[1:])]
            candidates_1_order.extend(confusion_word)

    # add all confusion word list
    confusion_word_set = set(candidates_1_order + candidates_2_order + candidates_3_order)
    confusion_word_list = [item for item in confusion_word_set if is_chinese_string(item)]
    confusion_sorted = sorted(confusion_word_list,
                              key=lambda k: get_frequency(k),
                              reverse=True)
    return confusion_sorted[:len(confusion_word_list) // fraction + 1]
Example 19
    def generate_items(self, word, fragment=1):
        """
        生成纠错候选集
        :param word:
        :param fragment: 分段
        :return:
        """
        self.check_corrector_initialized()
        # 1-char candidates
        candidates_1 = []
        # 2-char candidates
        candidates_2 = []
        # candidates longer than 2 chars
        candidates_3 = []

        # same pinyin word
        candidates_1.extend(self._confusion_word_set(word))
        # Case 1: confusion. An error listed in the custom confusion dict is simply replaced with its mapped correct value.
        # custom confusion word
        candidates_1.extend(self._confusion_custom_set(word))
        """
        第二种情况:对于word和char两种情况,没有对应的正确的值,就需要通过语言模型来从候选集中找了。

        候选集的构造逻辑如下,输入是item,也就是检错阶段得到的可能的错误词。首先,同音字和形近字表共同可以构建一个基于字的混淆集(confusion_char_set)。其次,借助于常用字表和item之间的编辑距离,可以构建一个比较粗的候选词集,通过常用词表可以做一个过滤,最后加上同音的限制条件,可以得到一个更小的基于词的混淆集(confusion_word_set)。最后,还有一个自定义的混淆集(confusion_custom _set)。

        有了上述的表,就可以构建候选集了。通过候选词的长度来分情况讨论:

        第一:长度为1。直接利用基于字的混淆集来替换。

        第二:长度为2。分别替换每一个字。

        第三:长度为3。同上。

        最后,合并所有的候选集。那么通过上述的构造过程,可能导致一些无效词或者字的出现,因此要做一些过滤处理,最后按照选出一些候选集的子集来处理。代码中的规则是基于词频来处理,选择topk个词作为候选集。
        """
        # same pinyin char
        if len(word) == 1:
            # same one char pinyin
            confusion = [i for i in self._confusion_char_set(word[0]) if i]
            candidates_1.extend(confusion)
        if len(word) == 2:
            # same first char pinyin
            confusion = [
                i + word[1:] for i in self._confusion_char_set(word[0]) if i
            ]
            candidates_2.extend(confusion)
            # same last char pinyin
            confusion = [
                word[:-1] + i for i in self._confusion_char_set(word[-1]) if i
            ]
            candidates_2.extend(confusion)
        if len(word) > 2:
            # same mid char pinyin
            confusion = [
                word[0] + i + word[2:]
                for i in self._confusion_char_set(word[1])
            ]
            candidates_3.extend(confusion)

            # same first word pinyin
            confusion_word = [
                i + word[-1] for i in self._confusion_word_set(word[:-1])
            ]
            candidates_3.extend(confusion_word)

            # same last word pinyin
            confusion_word = [
                word[0] + i for i in self._confusion_word_set(word[1:])
            ]
            candidates_3.extend(confusion_word)

        # add all confusion word list
        confusion_word_set = set(candidates_1 + candidates_2 + candidates_3)
        confusion_word_list = [
            item for item in confusion_word_set if is_chinese_string(item)
        ]
        confusion_sorted = sorted(confusion_word_list,
                                  key=lambda k: self.word_frequency(k),
                                  reverse=True)
        return confusion_sorted[:len(confusion_word_list) // fragment + 1]
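A toy end-to-end trace of the merge-filter-rank step described in the long comment above; the candidates and word frequencies are invented:

word_freq = {"账户": 9000, "胀户": 3, "帐沪": 0}
candidates = {"账户", "胀户", "帐沪", "帐x户"}  # "帐x户" is not pure Chinese

def is_chinese_string(s):
    return all('\u4e00' <= ch <= '\u9fff' for ch in s)

confusion_word_list = [w for w in candidates if is_chinese_string(w)]
confusion_sorted = sorted(confusion_word_list,
                          key=lambda k: word_freq.get(k, 0),
                          reverse=True)
fragment = 2
print(confusion_sorted[:len(confusion_word_list) // fragment + 1])
# fragment=2 keeps roughly the top half: ['账户', '胀户']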
Example 20
def _generate_items(sentence, idx, word, fraction=1):
    # initialize so the empty-word edge case cannot raise a NameError below
    confusion_word_set = set()
    if len(word) == 1:
        confusion_word_set = set([i for i in get_confusion_char_set(word[0]) if i])

    if len(word) > 1:

        def combine_two_confusion_char(sentence, idx, word):
            # assuming only two chars change
            # definitely not the final version; needs to be fixed!
            result = set()
            for i in range(len(word) - 1):
                for j in range(i + 1, len(word)):
                    result |= {word[:i] + i_word + word[i + 1:j] + j_word + word[j + 1:]
                               for i_word in get_confusion_char_set(word[i]) if i_word
                               for j_word in get_confusion_char_set(word[j]) if j_word}
            return result

        def confusion_set(sentence, idx, word):
            # the maximum number of substituted chars is bounded by edit_distance
            edit_distance = 2

            cands_tmp = [['', 0]]  # BFS queue of [prefix, edits_used]
            result = set()
            ids = list(range(int(idx.split(',')[0]), int(idx.split(',')[1])))

            # change individual chars
            while cands_tmp:

                if len(cands_tmp[0][0]) == len(word):
                    result.add(cands_tmp[0][0])

                elif cands_tmp[0][1] == edit_distance:
                    result.add(cands_tmp[0][0] + word[len(cands_tmp[0][0]):])

                else:
                    target_idx = ids[len(cands_tmp[0][0])]
                    for char_cand in get_confusion_char_set(sentence[target_idx]):

                        if target_idx == 0:
                            if char_cand + sentence[target_idx + 1] not in two_char_dict:
                                continue

                        elif target_idx == len(sentence) - 1:
                            if sentence[target_idx - 1] + char_cand not in two_char_dict:
                                continue

                        elif char_cand + sentence[target_idx + 1] not in two_char_dict and \
                             sentence[target_idx - 1] + char_cand not in two_char_dict:
                            continue
                        
                        if char_cand == sentence[target_idx]:
                            cands_tmp.append([cands_tmp[0][0] + char_cand, cands_tmp[0][1]])
                        else:
                            cands_tmp.append([cands_tmp[0][0] + char_cand, cands_tmp[0][1] + 1])

                cands_tmp.pop(0)

            # change two adjacent chars together
            for i in range(len(word) - 1):
                for char_i in get_confusion_char_set(word[i]):
                    for char_j in get_confusion_char_set(word[i + 1]):
                        if char_i + char_j in two_char_dict:
                            result.add(word[:i] + char_i + char_j + word[i + 2:])

            return result

        confusion_word_set = confusion_set(sentence, idx, word)

    confusion_word_list = [item for item in confusion_word_set if is_chinese_string(item)]
    confusion_sorted = sorted(confusion_word_list, key=lambda k: get_frequency(k), reverse=True)

    return confusion_sorted[:len(confusion_word_list) // fraction + 1]
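The `confusion_set` helper above is essentially a breadth-first search over character positions with a bounded number of substitutions. A stripped-down, self-contained version of the same idea, with toy confusion sets and without the bigram (`two_char_dict`) filtering:

from collections import deque

# Toy confusion sets; each char maps to itself plus its confusables.
confusion = {"帐": {"帐", "账"}, "户": {"户", "护"}}

def bounded_candidates(word, max_edits=2):
    # BFS over prefixes; each state is (prefix built so far, edits used).
    queue, result = deque([("", 0)]), set()
    while queue:
        prefix, edits = queue.popleft()
        if len(prefix) == len(word):
            result.add(prefix)
        elif edits == max_edits:
            result.add(prefix + word[len(prefix):])  # keep the rest unchanged
        else:
            ch = word[len(prefix)]
            for cand in confusion.get(ch, {ch}):
                queue.append((prefix + cand, edits + (cand != ch)))
    return result

print(sorted(bounded_candidates("帐户")))
# -> ['帐护', '帐户', '账护', '账户']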
Example 21
sys.path.append("../")
from pypinyin import lazy_pinyin
from pycorrector.utils.text_utils import traditional2simplified, simplified2traditional
from pycorrector.utils.text_utils import get_homophones_by_char, get_homophones_by_pinyin

traditional_sentence = '憂郁的臺灣烏龜'
simplified_sentence = traditional2simplified(traditional_sentence)
print(simplified_sentence)

simplified_sentence = '忧郁的台湾乌龟'
traditional_sentence = simplified2traditional(simplified_sentence)
print(traditional_sentence)

print(lazy_pinyin('中心'))  # without tone marks: ['zhong', 'xin']


pron = get_homophones_by_char('长')
print('get_homophones_by_char:', pron)

pron = get_homophones_by_pinyin('zha1ng')
print('get_homophones_by_pinyin:', pron)

from pycorrector.utils.text_utils import is_chinese, is_chinese_string
s = """现在 银色的K2P是MTK还是博通啊?李雯雯……“00后”选手
啥123kMk.23?? ''"’
"""
print(s, is_chinese_string(s))

for i in s:
    print(i, is_chinese(i))