Example #1
from xml.dom import minidom

# segment comes from pycorrector.utils.tokenizer; split_2_short_text is assumed
# to be exported by the same tokenizer module.
from pycorrector.utils.tokenizer import segment, split_2_short_text


def parse_xml_file(path, use_short_text=False, maximum_length=200):
    print('Parse data from %s' % path)
    data_list = []
    dom_tree = minidom.parse(path)
    docs = dom_tree.documentElement.getElementsByTagName('DOC')
    for doc in docs:
        # Input the text
        text = doc.getElementsByTagName('TEXT')[0]. \
            childNodes[0].data.strip()
        # Input the correct text
        correction = doc.getElementsByTagName('CORRECTION')[0]. \
            childNodes[0].data.strip()

        if use_short_text:
            texts = split_2_short_text(text)
            corrections = split_2_short_text(correction)
        else:
            texts = [text]
            corrections = [correction]
        if len(texts) != len(corrections):
            print('error diff:' + text + '\t' + correction)
            continue
        for i in range(len(texts)):
            if len(texts[i]) > maximum_length:
                print('error long:' + texts[i] + '\t' + corrections[i])
                continue
            source = segment(texts[i], cut_type='char')
            target = segment(corrections[i], cut_type='char')
            pair = [source, target]
            if pair not in data_list:
                data_list.append(pair)
    return data_list
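
A minimal usage sketch for parse_xml_file above; 'train.xml' is a placeholder for a SIGHAN-style XML corpus whose <DOC> entries contain <TEXT> and <CORRECTION> children, which is what the parser expects.

pairs = parse_xml_file('train.xml', use_short_text=True, maximum_length=128)  # 'train.xml' is hypothetical
for source, target in pairs[:3]:
    print(' '.join(source), '=>', ' '.join(target))
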
def test_ner():
    from pycorrector.utils.tokenizer import segment
    from pycorrector.corrector import Corrector
    c = Corrector()
    c.check_corrector_initialized()
    c.check_detector_initialized()
    error_sentences = [
        '这个消息在北京城里不胫儿走',
        '大家已经满头大汉了,休息吧',
        '我不要你花钱,这些路曲近通幽',  # 曲径通幽
        '这个消息不胫儿走',
        '这个消息不径而走',  # 胫
        '真的是无稽之谈',
        '真的是无集之谈',  # 集
        '小丽宝儿的学习成绩一落千仗太失望了',
        '肉骨头是索然无味',
        '肉骨头是索染无味',  # 然
        '看书是一心一意,绝不东张夕望,好厉害。',  # 西
        "复方甘草口服液好喝吗",
        '新进人员时,知识当然还不过,可是人有很有精神,面对工作很认真的话,很快就学会、体会。',
    ]
    for line in error_sentences:
        print(line)
        print("segment:", segment(line))
        print("tokenize:", c.tokenizer.tokenize(line))
        print(c.detect(line))
        correct_sent = c.correct(line)
        print("original sentence:{} => correct sentence:{}".format(line, correct_sent))
Example #3
    def segment(self, text, cut_type='char'):
        """
        Tokenization used by the correction module; character granularity by default.
        :param text: sentence to tokenize
        :param cut_type: tokenization granularity ('char' or 'word')
        :return: list
        """
        return segment(text, cut_type=cut_type)
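
A quick sketch of what the two cut_type modes return; the sample sentence is arbitrary and the call goes straight to pycorrector.utils.tokenizer.segment, which this method wraps.

from pycorrector.utils.tokenizer import segment

sample = '这个新药能治疗心绞痛'
print(segment(sample, cut_type='char'))  # one token per character
print(segment(sample, cut_type='word'))  # word-level tokens
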
def test_segment():
    """测试疾病名纠错"""
    error_sentence_1 = '这个新药奥美砂坦脂片能治疗心绞痛,效果还可以'  # 奥美沙坦酯片
    print(error_sentence_1)
    print(segment(error_sentence_1))
    import jieba
    print(list(jieba.tokenize(error_sentence_1)))
Example #5
from pycorrector.utils.tokenizer import segment


def segment_file(path):
    data_list = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line.startswith("#"):
                continue
            parts = line.split("\t")
            if len(parts) != 2:
                continue
            source = segment(parts[0].strip(), cut_type='char')
            target = segment(parts[1].strip(), cut_type='char')

            pair = [source, target]
            if pair not in data_list:
                data_list.append(pair)
    return data_list
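
For reference, segment_file expects a UTF-8 text file with one tab-separated wrong/correct sentence pair per line; lines starting with '#' are skipped. A minimal sketch with a hypothetical file name:

# Build a tiny sample file and parse it ('pairs.txt' is a placeholder).
with open('pairs.txt', 'w', encoding='utf-8') as f:
    f.write('# wrong\tcorrect\n')
    f.write('少先队员因该为老人让坐\t少先队员应该为老人让座\n')
pairs = segment_file('pairs.txt')
print(pairs[0])  # [source characters, target characters]
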
Example #6
from xml.dom import minidom

from pycorrector.utils.tokenizer import segment


def parse_xml_file(path):
    print('Parse data from %s' % path)
    data_list = []
    dom_tree = minidom.parse(path)
    docs = dom_tree.documentElement.getElementsByTagName('DOC')
    for doc in docs:
        # Input the text
        text = doc.getElementsByTagName('TEXT')[0]. \
            childNodes[0].data.strip()
        # Input the correct text
        correction = doc.getElementsByTagName('CORRECTION')[0]. \
            childNodes[0].data.strip()
        # Segment
        source = segment(text, cut_type='char')
        target = segment(correction, cut_type='char')
        data_list.append([source, target])
    return data_list
Example #7
def test_segment():
    """测试疾病名纠错"""
    error_sentence_1 = '这个新药奥美砂坦脂片能治疗心绞痛,效果还可以'  # 奥美沙坦酯片
    print(error_sentence_1)
    print(segment(error_sentence_1))
    import jieba
    print(list(jieba.tokenize(error_sentence_1)))
    import jieba.posseg as pseg
    words = pseg.lcut("我爱北京天安门")  # jieba default mode
    print('old:', words)
Example #8
from xml.dom import minidom

from pycorrector.utils.tokenizer import segment


def parse_xml_file(path):
    print('Parse data from %s' % path)
    word_arr = []
    with open(path, 'r', encoding='utf-8') as f:
        dom_tree = minidom.parse(f)
    docs = dom_tree.documentElement.getElementsByTagName('DOC')
    for doc in docs:
        # Read the corrected text
        text = doc.getElementsByTagName('CORRECTION')[0]. \
            childNodes[0].data.strip()
        # Segment
        word_seq = segment(text, cut_type='char', pos=False)
        word_arr.append(word_seq)
    return word_arr
Example #9
from xml.dom import minidom

from pycorrector.utils.tokenizer import segment


def parse_xml_file(path, use_segment, segment_type):
    print('Parse data from %s' % path)
    data_list = []
    dom_tree = minidom.parse(path)
    docs = dom_tree.documentElement.getElementsByTagName('DOC')
    for doc in docs:
        # Input the text
        text = doc.getElementsByTagName('TEXT')[0]. \
            childNodes[0].data.strip()
        # Input the correct text
        correction = doc.getElementsByTagName('CORRECTION')[0]. \
            childNodes[0].data.strip()

        if use_segment:
            source = ' '.join(segment(text, cut_type=segment_type))
            target = ' '.join(segment(correction, cut_type=segment_type))
        else:
            source = text
            target = correction

        pair = [source, target]
        if pair not in data_list:
            data_list.append(pair)
    return data_list
Example #10
from pycorrector.utils.tokenizer import segment


def get_data_file(path, use_segment, segment_type):
    data_list = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line.startswith("#"):
                continue
            parts = line.split("\t")
            if len(parts) != 2:
                continue
            target = ' '.join(segment(
                parts[1].strip(),
                cut_type=segment_type)) if use_segment else parts[1].strip()
            data_list.append(target)
    return data_list
Example #11
from xml.dom import minidom

from pycorrector.utils.tokenizer import segment


def parse_xml_file(path, use_segment, segment_type):
    print('Parse data from %s' % path)
    word_arr = []
    dom_tree = minidom.parse(path)
    docs = dom_tree.documentElement.getElementsByTagName('DOC')
    for doc in docs:
        # Read the corrected text
        text = doc.getElementsByTagName('CORRECTION')[0]. \
            childNodes[0].data.strip()
        # Segment
        word_seq = ' '.join(
            segment(text.strip(),
                    cut_type=segment_type)) if use_segment else text.strip()
        word_arr.append(word_seq)
    return word_arr
Example #12
    def ernie_correct(self, text, ernie_cut_type='char'):
        """
        句子纠错
        :param text: 句子文本
        :param ernie_cut_type: 切词类型(char/word)
        :return: corrected_text, list[list], [error_word, correct_word, begin_pos, end_pos]
        """
        text_new = ''
        details = []
        self.check_corrector_initialized()
        # Normalize the encoding: utf-8 to unicode
        text = convert_to_unicode(text)
        # Split long text into short blocks
        blocks = self.split_text_by_maxlen(text, maxlen=512)
        for blk, start_idx in blocks:
            blk_new = ''
            blk = segment(blk, cut_type=ernie_cut_type, pos=False)
            for idx, s in enumerate(blk):
                # Handle Chinese tokens only
                if is_chinese_string(s):
                    # Copy the token list, then mask the current token (one mask per character)
                    sentence_lst = list(blk)
                    sentence_lst[idx] = self.mask_token * len(s)
                    sentence_new = ' '.join(sentence_lst)
                    # Predict the masked position; top-5 candidates by default
                    predicts = self.predict_mask(sentence_new)
                    top_tokens = []
                    for p in predicts:
                        top_tokens.append(p.get('token', ''))

                    if top_tokens and (s not in top_tokens):
                        # Collect all plausible correction candidates
                        candidates = self.generate_items(s)
                        if candidates:
                            for token_str in top_tokens:
                                if token_str in candidates:
                                    details.append([
                                        s, token_str, start_idx + idx,
                                        start_idx + idx + 1
                                    ])
                                    s = token_str
                                    break
                blk_new += s
            text_new += blk_new
        details = sorted(details, key=operator.itemgetter(2))
        return text_new, details
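
The masking step above is easy to see in isolation. A minimal sketch, independent of the ERNIE model, of how a char-segmented block is masked one token at a time ('[MASK]' stands in for whatever self.mask_token actually is):

blk = ['这', '个', '新', '药']      # char-level tokens of one block
mask_token = '[MASK]'               # placeholder for the model's mask token
for idx, s in enumerate(blk):
    sentence_lst = list(blk)
    sentence_lst[idx] = mask_token * len(s)
    print(' '.join(sentence_lst))   # e.g. '[MASK] 个 新 药' for idx == 0
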
Example #13
    def get_lm_correct_item(self,
                            cur_item,
                            candidates,
                            before_sent,
                            after_sent,
                            threshold=57,
                            cut_type='char'):
        """
        通过语言模型纠正字词错误
        :param cur_item: 当前词
        :param candidates: 候选词
        :param before_sent: 前半部分句子
        :param after_sent: 后半部分句子
        :param threshold: ppl阈值, 原始字词替换后大于该ppl值则认为是错误
        :param cut_type: 切词方式, 字粒度
        :return: str, correct item, 正确的字词
        """
        result = cur_item
        if cur_item not in candidates:
            candidates.append(cur_item)

        ppl_scores = {
            i: self.ppl_score(
                segment(before_sent + i + after_sent, cut_type=cut_type))
            for i in candidates
        }
        sorted_ppl_scores = sorted(ppl_scores.items(), key=lambda d: d[1])

        # Widen the range of tokens accepted as correct to reduce false corrections
        top_items = []
        top_score = 0.0
        for i, v in enumerate(sorted_ppl_scores):
            v_word = v[0]
            v_score = v[1]
            if i == 0:
                top_score = v_score
                top_items.append(v_word)
            # Keep candidates whose ppl stays within the threshold of the best score
            elif v_score < top_score + threshold:
                top_items.append(v_word)
            else:
                break
        if cur_item not in top_items:
            result = top_items[0]
        return result
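
The candidate-selection rule above can be illustrated with made-up ppl scores (the numbers below are hypothetical, not model output):

ppl_scores = {'应该': 120.0, '因该': 190.0, '应当': 150.0}  # hypothetical ppl per candidate
threshold = 57
sorted_ppl_scores = sorted(ppl_scores.items(), key=lambda d: d[1])
top_score = sorted_ppl_scores[0][1]
# Keep every candidate whose ppl is within `threshold` of the best score.
top_items = [w for w, s in sorted_ppl_scores if s < top_score + threshold]
print(top_items)  # ['应该', '应当']; '因该' exceeds 120 + 57 and is rejected
cur_item = '因该'
result = cur_item if cur_item in top_items else top_items[0]
print(result)     # '应该'
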
Example #14
import pycorrector
from pycorrector.utils.tokenizer import segment

error_sentences = [
    '我不要你花钱,这些路曲近通幽',  # 曲径通幽
    '这个消息不胫儿走',
    '这个消息不径而走',  # 胫
    '真的是无稽之谈',
    '真的是无集之谈',  # 集
    '肉骨头是索然无味',
    '肉骨头是索染无味',  # 然
    '看书是一心一意,绝不东张夕望,好厉害。',  # 西
    "氨漠索注射液乙基",
    "丙卡特罗片(美普清)乙",
    "瓦贝沙坦技囊(伊泰青)乙省基",
    "复方氨基酸lt(18EAA利泰))甲,限〉基",
    "橘红痰咳液(限)乙省基",
    "兰索拉哇肠溶片乙省基",
    "氯化钾缓釋片甲基",
    "葡萄糖打甲基",
    "小牛曲清去蛋白提取物乙",
    "头抱曲松针(罗氏芬)申基",
    "复方甘草口服溶液限田基",
    '新进人员时,知识当然还不过,可是人有很有精神,面对工作很认真的话,很快就学会、体会。',
]
for line in error_sentences:
    print(line)
    print("segment:", segment(line))
    print(pycorrector.detect(line))
    correct_sent = pycorrector.correct(line)
    print("original sentence:{} => correct sentence:{}".format(
        line, correct_sent))
Example #15
from pycorrector.corrector import Corrector
from pycorrector.utils.tokenizer import segment

error_sentences = [
    '一只小鱼船浮在平净的河面上',  # [['船浮', '船夫', 4, 6],error; ['平净', '平静', 7, 9]])right
    '我的家乡是有明的渔米之乡',  # [['有明', '有名', 5, 7], ['渔米', '鱼米', 8, 10]])right;  [['渔米', '玉米', 8, 10]])error
    ' _ ,',
    '我对于宠物出租得事非常认同,因为其实很多人喜欢宠物',  # 出租的事
    '有了宠物出租地方另一方面还可以题高人类对动物的了解,因为那些专业人氏可以指导我们对于动物的习惯。',
    # 题高 => 提高 专业人氏 => 专业人士right; [['宠', '重', 2, 3], ['方面', '方便', 10, 12],error
    '三个凑皮匠胜过一个诸葛亮也有道理。',  # [['三个凑皮匠', '三个臭皮匠', 0, 5]])
    '还有广告业是只要桌子前面坐者工作未必产生出来好的成果。',

]

d = Corrector()
for i in error_sentences:
    print(i, d.detect(i))

sent1 = '少先队员应该为老人让座'
sent_seg = segment(sent1)
ppl = d.ppl_score(sent_seg)
print(sent1, 'ppl_score:', ppl)

sent2 = '少先队员因该为老人让坐'
sent_seg = segment(sent2)
ppl = d.ppl_score(sent_seg)
print(sent2, 'ppl_score:', ppl)

print(sent1, d.detect(sent1))
print(sent2, d.detect(sent2))

freq = d.word_frequency('龟龙麟凤')
print('freq:', freq)