from xml.dom import minidom

from pycorrector.utils.tokenizer import segment, split_2_short_text


def parse_xml_file(path, use_short_text=False, maximum_length=200):
    print('Parse data from %s' % path)
    data_list = []
    dom_tree = minidom.parse(path)
    docs = dom_tree.documentElement.getElementsByTagName('DOC')
    for doc in docs:
        # Read the original text
        text = doc.getElementsByTagName('TEXT')[0].childNodes[0].data.strip()
        # Read the corrected text
        correction = doc.getElementsByTagName('CORRECTION')[0].childNodes[0].data.strip()
        # Optionally split long sentences into short clauses
        if use_short_text:
            texts = split_2_short_text(text)
            corrections = split_2_short_text(correction)
        else:
            texts = [text]
            corrections = [correction]
        # Skip pairs whose clause counts diverge after splitting
        if len(texts) != len(corrections):
            print('error diff:' + text + '\t' + correction)
            continue
        for i in range(len(texts)):
            # Skip overlong clauses
            if len(texts[i]) > maximum_length:
                print('error long:' + texts[i] + '\t' + corrections[i])
                continue
            source = segment(texts[i], cut_type='char')
            target = segment(corrections[i], cut_type='char')
            pair = [source, target]
            if pair not in data_list:
                data_list.append(pair)
    return data_list
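A minimal usage sketch for the function above; 'sighan_train.xml' is a hypothetical path to a SIGHAN-style XML file whose DOC elements carry TEXT and CORRECTION children.

pairs = parse_xml_file('sighan_train.xml', use_short_text=True, maximum_length=128)
print('loaded %d pairs' % len(pairs))
if pairs:
    source, target = pairs[0]  # each side is a list of character tokens
    print(source, '=>', target)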
def test_ner():
    from pycorrector.utils.tokenizer import segment
    from pycorrector.corrector import Corrector
    c = Corrector()
    c.check_corrector_initialized()
    c.check_detector_initialized()
    error_sentences = [
        '这个消息在北京城里不胫儿走',
        '大家已经满头大汉了,休息吧',
        '我不要你花钱,这些路曲近通幽',  # 曲径通幽
        '这个消息不胫儿走',
        '这个消息不径而走',  # 胫
        '真的是无稽之谈',
        '真的是无集之谈',  # 集
        '小丽宝儿的学习成绩一落千仗太失望了',
        '肉骨头是索然无味',
        '肉骨头是索染无味',  # 然
        '看书是一心一意,绝不东张夕望,好厉害。',  # 西
        "复方甘草口服液好喝吗",
        '新进人员时,知识当然还不过,可是人有很有精神,面对工作很认真的话,很快就学会、体会。',
    ]
    for line in error_sentences:
        print(line)
        print("segment:", segment(line))
        print("tokenize:", c.tokenizer.tokenize(line))
        print(c.detect(line))
        correct_sent = c.correct(line)
        print("original sentence:{} => correct sentence:{}".format(line, correct_sent))
def segment(self, text, cut_type='char'):
    """
    Tokenization for the correction module; character granularity by default.
    :param text: sentence to tokenize
    :param cut_type: token granularity (char/word)
    :return: list
    """
    return segment(text, cut_type=cut_type)
from pycorrector.utils.tokenizer import segment


def segment_file(path):
    data_list = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Skip comment lines
            if line.startswith("#"):
                continue
            # Expect tab-separated source/target pairs
            parts = line.split("\t")
            if len(parts) != 2:
                continue
            source = segment(parts[0].strip(), cut_type='char')
            target = segment(parts[1].strip(), cut_type='char')
            pair = [source, target]
            if pair not in data_list:
                data_list.append(pair)
    return data_list
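A quick usage sketch; 'pairs.tsv' is a hypothetical tab-separated file written here just for illustration.

with open('pairs.tsv', 'w', encoding='utf-8') as f:
    f.write('# lines starting with # are skipped\n')
    f.write('少先队员因该为老人让坐\t少先队员应该为老人让座\n')
pairs = segment_file('pairs.tsv')
print(pairs[0][0])  # character tokens of the erroneous source sentence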
from xml.dom import minidom

from pycorrector.utils.tokenizer import segment


def parse_xml_file(path):
    print('Parse data from %s' % path)
    data_list = []
    dom_tree = minidom.parse(path)
    docs = dom_tree.documentElement.getElementsByTagName('DOC')
    for doc in docs:
        # Read the original text
        text = doc.getElementsByTagName('TEXT')[0].childNodes[0].data.strip()
        # Read the corrected text
        correction = doc.getElementsByTagName('CORRECTION')[0].childNodes[0].data.strip()
        # Segment both sides into characters
        source = segment(text, cut_type='char')
        target = segment(correction, cut_type='char')
        data_list.append([source, target])
    return data_list
def test_segment():
    """Test correction of disease and drug names."""
    error_sentence_1 = '这个新药奥美砂坦脂片能治疗心绞痛,效果还可以'  # 奥美沙坦酯片
    print(error_sentence_1)
    print(segment(error_sentence_1))
    import jieba
    print(list(jieba.tokenize(error_sentence_1)))
    import jieba.posseg as pseg
    words = pseg.lcut("我爱北京天安门")  # jieba default mode
    print('old:', words)
from xml.dom import minidom

from pycorrector.utils.tokenizer import segment


def parse_xml_file(path):
    print('Parse data from %s' % path)
    word_arr = []
    with open(path, 'r', encoding='utf-8') as f:
        dom_tree = minidom.parse(f)
    docs = dom_tree.documentElement.getElementsByTagName('DOC')
    for doc in docs:
        # Read the corrected text only
        text = doc.getElementsByTagName('CORRECTION')[0].childNodes[0].data.strip()
        # Segment into characters
        word_seq = segment(text, cut_type='char', pos=False)
        word_arr.append(word_seq)
    return word_arr
from xml.dom import minidom

from pycorrector.utils.tokenizer import segment


def parse_xml_file(path, use_segment, segment_type):
    print('Parse data from %s' % path)
    data_list = []
    dom_tree = minidom.parse(path)
    docs = dom_tree.documentElement.getElementsByTagName('DOC')
    for doc in docs:
        # Read the original text
        text = doc.getElementsByTagName('TEXT')[0].childNodes[0].data.strip()
        # Read the corrected text
        correction = doc.getElementsByTagName('CORRECTION')[0].childNodes[0].data.strip()
        # Optionally join segmented tokens with spaces
        source = ' '.join(segment(text, cut_type=segment_type)) if use_segment else text
        target = ' '.join(segment(correction, cut_type=segment_type)) if use_segment else correction
        pair = [source, target]
        if pair not in data_list:
            data_list.append(pair)
    return data_list
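Usage sketch; 'sighan_train.xml' is again a hypothetical path, and with use_segment=True each side comes back as a space-joined token string ready for plain-text training files.

pairs = parse_xml_file('sighan_train.xml', use_segment=True, segment_type='char')
for source, target in pairs[:3]:
    print(source, '=>', target)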
from pycorrector.utils.tokenizer import segment


def get_data_file(path, use_segment, segment_type):
    data_list = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Skip comment lines
            if line.startswith("#"):
                continue
            # Expect tab-separated source/target pairs; keep the target side only
            parts = line.split("\t")
            if len(parts) != 2:
                continue
            target = ' '.join(segment(parts[1].strip(), cut_type=segment_type)) if use_segment else parts[1].strip()
            data_list.append(target)
    return data_list
from xml.dom import minidom

from pycorrector.utils.tokenizer import segment


def parse_xml_file(path, use_segment, segment_type):
    print('Parse data from %s' % path)
    word_arr = []
    dom_tree = minidom.parse(path)
    docs = dom_tree.documentElement.getElementsByTagName('DOC')
    for doc in docs:
        # Read the corrected text only
        text = doc.getElementsByTagName('CORRECTION')[0].childNodes[0].data.strip()
        # Optionally join segmented tokens with spaces
        word_seq = ' '.join(segment(text, cut_type=segment_type)) if use_segment else text
        word_arr.append(word_seq)
    return word_arr
def ernie_correct(self, text, ernie_cut_type='char'):
    """
    Correct a sentence.
    :param text: sentence text
    :param ernie_cut_type: token granularity (char/word)
    :return: corrected_text, list[list], [error_word, correct_word, begin_pos, end_pos]
    """
    text_new = ''
    details = []
    self.check_corrector_initialized()
    # Normalize encoding: utf-8 to unicode
    text = convert_to_unicode(text)
    # Split long text into blocks of at most 512 tokens
    blocks = self.split_text_by_maxlen(text, maxlen=512)
    for blk, start_idx in blocks:
        blk_new = ''
        blk = segment(blk, cut_type=ernie_cut_type, pos=False)
        for idx, s in enumerate(blk):
            # Only Chinese tokens are checked
            if is_chinese_string(s):
                # Copy the block and mask the current token
                sentence_lst = list(blk)
                sentence_lst[idx] = self.mask_token * len(s)
                sentence_new = ' '.join(sentence_lst)
                # Predict fillers for the mask; top 5 by default
                predicts = self.predict_mask(sentence_new)
                top_tokens = [p.get('token', '') for p in predicts]
                if top_tokens and (s not in top_tokens):
                    # Collect all plausible corrections for the token
                    candidates = self.generate_items(s)
                    if candidates:
                        # Accept the first prediction that is also a candidate
                        for token_str in top_tokens:
                            if token_str in candidates:
                                details.append([s, token_str, start_idx + idx, start_idx + idx + 1])
                                s = token_str
                                break
            blk_new += s
        text_new += blk_new
    details = sorted(details, key=operator.itemgetter(2))
    return text_new, details
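The selection step above reduces to: keep the token if the masked-language model already predicts it, otherwise take the first prediction that also appears in the token's confusion-set candidates. A minimal standalone sketch of just that rule (pick_correction and the sample data are illustrative, not part of the library):

def pick_correction(token, predicted_tokens, confusion_candidates):
    # Keep the token if the model already ranks it among its predictions.
    if token in predicted_tokens:
        return token
    # Otherwise accept the first prediction confirmed by the confusion set.
    for p in predicted_tokens:
        if p in confusion_candidates:
            return p
    return token


print(pick_correction('径', ['胫', '经'], {'胫', '茎'}))  # -> '胫'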
def get_lm_correct_item(self, cur_item, candidates, before_sent, after_sent,
                        threshold=57, cut_type='char'):
    """
    Correct a token error with the language model.
    :param cur_item: current token
    :param candidates: candidate tokens
    :param before_sent: text before the token
    :param after_sent: text after the token
    :param threshold: ppl threshold; a candidate whose ppl exceeds the best
                      score plus this margin is rejected
    :param cut_type: segmentation granularity, character level by default
    :return: str, the corrected token
    """
    result = cur_item
    if cur_item not in candidates:
        candidates.append(cur_item)
    # Perplexity of the full sentence with each candidate substituted in
    ppl_scores = {
        i: self.ppl_score(segment(before_sent + i + after_sent, cut_type=cut_type))
        for i in candidates
    }
    sorted_ppl_scores = sorted(ppl_scores.items(), key=lambda d: d[1])
    # Widen the acceptance range around the best score to reduce false corrections
    top_items = []
    top_score = 0.0
    for i, (v_word, v_score) in enumerate(sorted_ppl_scores):
        if i == 0:
            top_score = v_score
            top_items.append(v_word)
        # Keep candidates within the threshold of the best score
        elif v_score < top_score + threshold:
            top_items.append(v_word)
        else:
            break
    # Replace the current token only if it falls outside the acceptance set
    if cur_item not in top_items:
        result = top_items[0]
    return result
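A worked illustration of the thresholding with made-up perplexity scores: the best-scoring candidate anchors the band, and anything within threshold of it survives, so the original token is only replaced when it scores clearly worse than the alternatives.

ppl_scores = {'让座': 310.0, '入座': 350.0, '让坐': 420.0}  # fabricated scores
sorted_scores = sorted(ppl_scores.items(), key=lambda d: d[1])
top_score = sorted_scores[0][1]
top_items = [w for w, s in sorted_scores if s < top_score + 57]
print(top_items)  # ['让座', '入座']; '让坐' scores outside the 57-point band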
import pycorrector
from pycorrector.utils.tokenizer import segment

error_sentences = [
    '我不要你花钱,这些路曲近通幽',  # 曲径通幽
    '这个消息不胫儿走',
    '这个消息不径而走',  # 胫
    '真的是无稽之谈',
    '真的是无集之谈',  # 集
    '肉骨头是索然无味',
    '肉骨头是索染无味',  # 然
    '看书是一心一意,绝不东张夕望,好厉害。',  # 西
    "氨漠索注射液乙基",
    "丙卡特罗片(美普清)乙",
    "瓦贝沙坦技囊(伊泰青)乙省基",
    "复方氨基酸lt(18EAA利泰))甲,限〉基",
    "橘红痰咳液(限)乙省基",
    "兰索拉哇肠溶片乙省基",
    "氯化钾缓釋片甲基",
    "葡萄糖打甲基",
    "小牛曲清去蛋白提取物乙",
    "头抱曲松针(罗氏芬)申基",
    "复方甘草口服溶液限田基",
    '新进人员时,知识当然还不过,可是人有很有精神,面对工作很认真的话,很快就学会、体会。',
]
for line in error_sentences:
    print(line)
    print("segment:", segment(line))
    print(pycorrector.detect(line))
    correct_sent = pycorrector.correct(line)
    print("original sentence:{} => correct sentence:{}".format(line, correct_sent))
from pycorrector.corrector import Corrector
from pycorrector.utils.tokenizer import segment

error_sentences = [
    '一只小鱼船浮在平净的河面上',  # [['船浮', '船夫', 4, 6]] error; [['平净', '平静', 7, 9]] right
    '我的家乡是有明的渔米之乡',  # [['有明', '有名', 5, 7], ['渔米', '鱼米', 8, 10]] right; [['渔米', '玉米', 8, 10]] error
    ' _ ,',  # edge case: whitespace and punctuation only
    '我对于宠物出租得事非常认同,因为其实很多人喜欢宠物',  # 出租的事
    '有了宠物出租地方另一方面还可以题高人类对动物的了解,因为那些专业人氏可以指导我们对于动物的习惯。',  # 题高 => 提高, 专业人氏 => 专业人士 right; [['宠', '重', 2, 3], ['方面', '方便', 10, 12]] error
    '三个凑皮匠胜过一个诸葛亮也有道理。',  # [['三个凑皮匠', '三个臭皮匠', 0, 5]]
    '还有广告业是只要桌子前面坐者工作未必产生出来好的成果。',
]

d = Corrector()
for i in error_sentences:
    print(i, d.detect(i))

sent1 = '少先队员应该为老人让座'
sent_seg = segment(sent1)
ppl = d.ppl_score(sent_seg)
print(sent1, 'ppl_score:', ppl)

sent2 = '少先队员因该为老人让坐'
sent_seg = segment(sent2)
ppl = d.ppl_score(sent_seg)
print(sent2, 'ppl_score:', ppl)

print(sent1, d.detect(sent1))
print(sent2, d.detect(sent2))

freq = d.word_frequency('龟龙麟凤')
print('freq:', freq)