def parse_xml_file(path, max_len=40):
    """Parse a SIGHAN-style XML file into (source, target) char-token pairs.

    Each <DOC> contributes its <TEXT> (erroneous sentence) and <CORRECTION>
    (corrected sentence).  Long paragraphs are split into aligned short
    sentences; pairs whose halves do not align, or whose source exceeds
    ``max_len`` characters, are reported and skipped.

    :param path: path to the XML corpus file.
    :param max_len: maximum source-sentence length kept (default 40,
        matching the previously hard-coded limit).
    :return: list of unique ``[source_tokens, target_tokens]`` pairs,
        in first-seen order.
    """
    print('Parse data from %s' % path)
    data_list = []
    # Seen-set gives O(1) dedup; the original `pair not in data_list`
    # was an O(n) scan per append (O(n^2) overall).
    seen = set()
    dom_tree = minidom.parse(path)
    docs = dom_tree.documentElement.getElementsByTagName('DOC')
    for doc in docs:
        # Input the text
        text = doc.getElementsByTagName('TEXT')[0].childNodes[0].data.strip()
        # Input the correct text
        correction = doc.getElementsByTagName('CORRECTION')[0].childNodes[0].data.strip()
        texts = split_2_short_text(text)
        corrections = split_2_short_text(correction)
        if len(texts) != len(corrections):
            # Splitting produced misaligned halves; cannot build pairs.
            print('error:' + text + '\t' + correction)
            continue
        for src_text, tgt_text in zip(texts, corrections):
            if len(src_text) > max_len:
                print('error:' + src_text + '\t' + tgt_text)
                continue
            source = segment(src_text, cut_type='char')
            target = segment(tgt_text, cut_type='char')
            # Token lists are unhashable; key on tuples for the set.
            key = (tuple(source), tuple(target))
            if key not in seen:
                seen.add(key)
                data_list.append([source, target])
    return data_list
def parse_xml_file(path):
    """Parse an XML corpus of <DOC> entries into [source, target] token pairs.

    For every <DOC>, the <TEXT> (erroneous) and <CORRECTION> (corrected)
    contents are char-segmented and appended as one pair.

    :param path: path to the XML corpus file.
    :return: list of ``[source_tokens, target_tokens]`` pairs.
    """
    print('Parse data from %s' % path)

    def _first_tag_text(node, tag):
        # Stripped text content of the first child element named `tag`.
        return node.getElementsByTagName(tag)[0].childNodes[0].data.strip()

    dom_tree = minidom.parse(path)
    pairs = []
    for doc in dom_tree.documentElement.getElementsByTagName('DOC'):
        # Input the text
        raw_text = _first_tag_text(doc, 'TEXT')
        # Input the correct text
        raw_correction = _first_tag_text(doc, 'CORRECTION')
        # Segment
        pairs.append([segment(raw_text, cut_type='char'),
                      segment(raw_correction, cut_type='char')])
    return pairs
def parse_txt_file(input_path, truth_path):
    """Parse a CGED-style TXT corpus into sequence-labeling training data.

    ``truth_path`` lines look like ``text_id,start_off,end_off,error_type``
    with 1-based, end-inclusive character offsets.  ``input_path`` lines look
    like ``(sid=text_id)\\ttext``.  Characters inside an annotated error span
    are labeled with the error type; all other characters get their POS tag.

    :param input_path: path to the sentence file.
    :param truth_path: path to the error-annotation file.
    :return: ``(id_lst, word_lst, label_lst)`` parallel lists, one entry per
        input sentence.
    """
    print('Parse data from %s and %s' % (input_path, truth_path))
    id_lst, word_lst, label_lst = [], [], []
    # read truth file: text_id -> {0-based char index: error_type}
    truth_dict = {}
    with open(truth_path, 'r', encoding='utf-8') as truth_f:
        for line in truth_f:
            parts = line.strip().split(',')
            if len(parts) != 4:
                # Skip malformed annotation lines.  The original fell
                # through here and touched truth_dict with a stale text_id
                # (NameError on the very first line).
                continue
            text_id = parts[0]
            start_off = int(parts[1])
            end_off = int(parts[2])
            error_type = parts[3].strip()
            # Locate the error position: 1-based inclusive offsets map to
            # the 0-based half-open range [start_off - 1, end_off).
            locate_dict = {i: error_type for i in range(start_off - 1, end_off)}
            # A text can carry several annotations; merge them.
            truth_dict.setdefault(text_id, {}).update(locate_dict)
    # read input file and get tokenize
    with open(input_path, 'r', encoding='utf-8') as input_f:
        for line in input_f:
            parts = line.strip().split('\t')
            text_id = parts[0].replace('(sid=', '').replace(')', '')
            text = parts[1]
            # segment with pos
            word_seq, pos_seq = segment(text, cut_type='char', pos=True)
            locate_dict = truth_dict.get(text_id)
            if locate_dict:
                word_arr = list(word_seq)
                # fill with error type inside spans, pos tag elsewhere
                label_arr = [locate_dict.get(i, pos_seq[i])
                             for i in range(len(word_seq))]
            else:
                word_arr = word_seq
                label_arr = pos_seq
            id_lst.append(text_id)
            word_lst.append(word_arr)
            label_lst.append(label_arr)
    return id_lst, word_lst, label_lst
def parse_xml_file(path):
    """Collect char-segmented CORRECTION sentences from an XML corpus.

    :param path: path to the XML corpus file.
    :return: list of character-token sequences, one per <DOC>.
    """
    print('Parse data from %s' % path)
    sentences = []
    with open(path, 'r', encoding='utf-8') as f:
        dom_tree = minidom.parse(f)
        for doc in dom_tree.documentElement.getElementsByTagName('DOC'):
            # Input the (corrected) text
            node = doc.getElementsByTagName('CORRECTION')[0]
            text = node.childNodes[0].data.strip()
            # Segment into characters
            sentences.append(segment(text, cut_type='char', pos=False))
    return sentences
def parse_xml_file(path):
    """Parse a CGED-style XML file into sequence-labeling training data.

    Each <DOC> has a <TEXT> element (with an ``id`` attribute) and zero or
    more <ERROR> elements carrying 1-based, end-inclusive ``start_off`` /
    ``end_off`` character offsets and a ``type``.  Characters inside an
    error span are labeled with the error type; all others get their POS tag.

    :param path: path to the XML corpus file.
    :return: ``(id_lst, word_lst, label_lst)`` parallel lists, one per <DOC>.
    """
    print('Parse data from %s' % path)
    id_lst, word_lst, label_lst = [], [], []
    with open(path, 'r', encoding='utf-8') as f:
        # Parse the opened handle; the original re-parsed `path` and left
        # the handle unused.
        dom_tree = minidom.parse(f)
    for doc in dom_tree.documentElement.getElementsByTagName('DOC'):
        # Input the text (single lookup; the original queried TEXT twice)
        text_node = doc.getElementsByTagName('TEXT')[0]
        text = text_node.childNodes[0].data.strip()
        text_id = text_node.getAttribute('id')
        # Locate the error position and error type: 1-based inclusive
        # offsets map to the 0-based half-open range [start_off - 1, end_off).
        locate_dict = {}
        for error in doc.getElementsByTagName('ERROR'):
            start_off = int(error.getAttribute('start_off'))
            end_off = int(error.getAttribute('end_off'))
            error_type = error.getAttribute('type')
            for i in range(start_off - 1, end_off):
                locate_dict[i] = error_type
        # Segment with pos
        word_seq, pos_seq = segment(text, cut_type='char', pos=True)
        # Fill with error type inside spans, pos tag elsewhere
        word_arr = list(word_seq)
        label_arr = [locate_dict.get(i, pos_seq[i]) for i in range(len(word_seq))]
        id_lst.append(text_id)
        word_lst.append(word_arr)
        label_lst.append(label_arr)
    return id_lst, word_lst, label_lst
default_logger.warn("index error, sentence:" + sentence + ie) except Exception as e: default_logger.warn("detect error, sentence:" + sentence + e) return sorted(maybe_error_indices) if __name__ == '__main__': sent = '少先队员因该为老人让坐' # sent = '机七学习是人工智能领遇最能体现智能的一个分知' error_list = detect(sent) print(error_list) sent_chars = [sent[i] for i in error_list] print(sent_chars) from pycorrector.utils.text_utils import segment, tokenize print(get_ngram_score(segment(sent))) print(get_ppl_score(segment(sent))) print(get_ngram_score(list(sent), mode=trigram_char)) print(get_ppl_score(list(sent), mode=trigram_char)) sent = '少先队员应该为老人让座' print(detect(sent)) print(get_ngram_score(segment(sent))) print(get_ppl_score(segment(sent))) print(get_ngram_score(list(sent), mode=trigram_char)) print(get_ppl_score(list(sent), mode=trigram_char))
pass except Exception as e: print("detect error, sentence:", sentence, e) return sorted(maybe_error_indices) if __name__ == '__main__': sent = '少先队员因该为老人让坐' # sent = '机七学习是人工智能领遇最能体现智能的一个分知' error_list = detect(sent) print(error_list) sent_chars = [sent[i] for i in error_list] print(sent_chars) from pycorrector.utils.text_utils import segment, tokenize print(get_ngram_score(segment(sent))) print(get_ppl_score(segment(sent))) print(get_ngram_score(list(sent), mode=trigram_char)) print(get_ppl_score(list(sent), mode=trigram_char)) sent = '少先队员应该为老人让座' print(detect(sent)) print(get_ngram_score(segment(sent))) print(get_ppl_score(segment(sent))) print(get_ngram_score(list(sent), mode=trigram_char)) print(get_ppl_score(list(sent), mode=trigram_char))