Beispiel #1
0
def text_corrector(text, confusion: bool = True, non_level="char"):
    """Detect suspicious words in ``text`` and report where they start.

    The text is passed to ``pycorrector.detect``; each detected entry is
    reduced to its first two fields (the word and its start index) and then
    filtered by ``non_level``.

    Parameters:
        text: the text to check.
        confusion: when true, ``CONFUSION_PATH`` is registered through
            ``pycorrector.set_custom_confusion_dict`` before detection, so
            the custom confusion words take part in the check.
        non_level: the detection level to switch OFF.
            ``"char"`` drops single-character hits (only words longer than
            one character are reported), ``"word"`` drops multi-character
            hits (only single characters are reported), ``None`` drops
            nothing. Any other value matches no entry, so the result is
            an empty list.

    Returns:
        A list of ``{"word": ..., "start_index": ...}`` dicts, one per kept
        detection, in the order ``pycorrector.detect`` produced them. The
        list is empty when nothing was detected or nothing passed the filter.

    Examples (illustrative -- actual hits depend on the pycorrector model):
        >>> text_corrector('少先队员因该为老人让坐')
        [{'word': '因该', 'start_index': 4}]
        >>> text_corrector('少先队员因该为老人让坐', non_level=None)
        [{'word': '因该', 'start_index': 4}, {'word': '坐', 'start_index': 10}]
    """
    # The custom dictionary must be registered before detect() runs,
    # otherwise the confusion words would not influence this call.
    if confusion:
        pycorrector.set_custom_confusion_dict(CONFUSION_PATH)

    def level_allows(word):
        # Decide whether a detected word survives the ``non_level`` filter.
        if non_level is None:
            return True
        if non_level == "char":
            return len(word) > 1
        if non_level == "word":
            return len(word) == 1
        return False  # unknown level: nothing is reported

    # item[0] is the detected word, item[1] its start index; any further
    # fields returned by pycorrector.detect are ignored here.
    return [
        {"word": item[0], "start_index": item[1]}
        for item in pycorrector.detect(text)
        if level_allows(item[0])
    ]
Beispiel #2
0
    def test_confusion_dict(self):
        """Check that a custom confusion dict changes ``correct`` results.

        The same two sentences are run through ``pycorrector.correct`` once
        before and once after ``pycorrector.set_custom_confusion_dict`` is
        called, and the correction details of each run are compared with the
        expected values.
        """
        sents = [
            '买iphonex,要多少钱',
            '共同实际控制人萧华、霍荣铨、张旗康',
        ]

        def collect_details():
            # Correct every sentence, echo its detail list and keep it.
            details = []
            for sentence in sents:
                _, detail = pycorrector.correct(sentence)
                print(detail)
                details.append(detail)
            return details

        # Run 1: before the custom confusion dict is registered.
        before = collect_details()
        self.assertEqual(before[0], [])
        self.assertEqual(before[1], [('张旗康', '张启康', 14, 17)])

        # Run 2: same sentences after registering the custom confusion dict.
        # NOTE(review): the path is relative, so it resolves against the
        # current working directory of the test run.
        pycorrector.set_custom_confusion_dict('../examples/my_custom_confusion.txt')
        after = collect_details()
        self.assertEqual(after[0], [('iphonex', 'iphoneX', 1, 8)])
        self.assertEqual(after[1], [])
Beispiel #3
0
    def initialize(self):
        """Load dictionaries, corrector resources and word lists from disk.

        Every file is looked up under ``self.server_assets_path``:

        * ``dict.big.txt`` is passed to ``jieba.set_dictionary`` and
          ``ase_jieba_dict.txt`` to ``jieba.load_userdict``.
        * ``my_custom_confusion.txt`` and ``my_custom_word_freq.txt`` are
          passed to ``set_custom_confusion_dict`` and
          ``set_custom_word_freq`` respectively.
        * ``self.stopword_file`` is read into ``self.stop_words`` and
          ``self.keyword_file`` into ``self.keywords`` (both lists of str).

        Both word-list files are opened as UTF-8 text.
        """
        assets = self.server_assets_path

        jieba.set_dictionary(f'{assets}/dict.big.txt')
        jieba.load_userdict(f'{assets}/ase_jieba_dict.txt')
        set_custom_confusion_dict(path=f'{assets}/my_custom_confusion.txt')
        set_custom_word_freq(path=f'{assets}/my_custom_word_freq.txt')

        # The ``with`` block closes the file on exit, so no explicit
        # close() call is needed.
        with open(f'{assets}/{self.stopword_file}', 'r',
                  encoding='utf-8') as f:
            # split('\n') is kept on purpose: unlike splitlines() it yields
            # a trailing '' entry when the file ends with a newline, which
            # is what the previous implementation produced.
            self.stop_words = f.read().split('\n')

        with open(f'{assets}/{self.keyword_file}', 'r',
                  encoding='utf-8') as f:
            # One keyword per line, with the line terminator removed.
            self.keywords = [line.replace('\n', '') for line in f]
if __name__ == '__main__':
    # Demo: correct the same sentences without and then with a custom
    # confusion dictionary, printing the result of each call.
    error_sentences = [
        '买iphonex,要多少钱',  # missed recall (error not detected)
        '我想喝小明同学。',  # missed recall (error not detected)
        '哪里卖苹果吧?请大叔给我让坐',  # missed recall (error not detected)
        '交通先行了怎么过去啊?',  # missed recall (error not detected)
        '共同实际控制人萧华、霍荣铨、张旗康',  # false positive (wrongly changed)
        '上述承诺内容系本人真实意思表示',  # correct sentence
        '大家一哄而伞怎么回事',  # idiom (chengyu)
    ]

    # Pass 1: default behaviour.
    for sentence in error_sentences:
        outcome = pycorrector.correct(sentence)
        print(outcome)

    separator = '*' * 42
    print(separator)

    # Pass 2: same sentences after registering the custom confusion dict.
    pycorrector.set_custom_confusion_dict(path='./my_custom_confusion.txt')
    for sentence in error_sentences:
        outcome = pycorrector.correct(sentence)
        print(outcome)

# ('买iphonex,要多少钱', [])
# ('我想喝小明同学。', [])
# ('哪里卖苹果吧?请大叔给我让坐', [])
# ('交通先行了怎么过去啊?', [])
# ('共同实际控制人萧华、霍荣铨、张启康', [['张旗康', '张启康', 14, 17]])
# ('上述承诺内容系本人真实意思表示', [])
# *****************************************************
# ('买iphoneX,要多少钱', [['iphonex', 'iphoneX', 1, 8]])
# ('我想喝小茗同学。', [['小明同学', '小茗同学', 3, 7]])
# ('哪里卖苹果八?请大叔给我让坐', [['苹果吧', '苹果八', 3, 6]])
# ('交通限行了怎么过去啊?', [['交通先行', '交通限行', 0, 4]])
# ('共同实际控制人萧华、霍荣铨、张旗康', [])