def parse_xml_file(path):
    """Parse a SIGHAN-style XML file into (source, target) character pairs.

    Each <DOC> element contributes one ``[source, target]`` entry, where
    ``source`` is the char-segmented original TEXT and ``target`` is the
    char-segmented CORRECTION.

    :param path: path to the XML file to read
    :return: list of [source_chars, target_chars] pairs
    """
    print('Parse data from %s' % path)
    pairs = []
    with open(path, 'r', encoding='utf-8') as f:
        root = minidom.parse(f).documentElement
        for doc in root.getElementsByTagName('DOC'):
            # Original sentence
            raw_text = doc.getElementsByTagName('TEXT')[0] \
                .childNodes[0].data.strip()
            # Gold corrected sentence
            raw_correction = doc.getElementsByTagName('CORRECTION')[0] \
                .childNodes[0].data.strip()
            # Character-level segmentation for both sides
            pairs.append([segment(raw_text, cut_type='char'),
                          segment(raw_correction, cut_type='char')])
    return pairs
def decode_sentence(sess, model, data_reader, sentence, corrective_tokens=None,
                    verbose=True):
    """Decode a single sentence through the model.

    Used with InteractiveSession in IPython.

    :param sess: TensorFlow session to run the model in
    :param model: the correction model
    :param data_reader: reader providing the model's vocabulary/encoding
    :param sentence: raw sentence string; char-segmented before decoding
    :param corrective_tokens: optional set of tokens the decoder may emit
        as corrections; defaults to an empty set
    :param verbose: forwarded to ``decode``
    :return: the first decoded result yielded by ``decode``
    """
    # BUG FIX: the original used a mutable default (`corrective_tokens=set()`),
    # which is shared across all calls — any mutation inside `decode` would
    # leak into subsequent calls. Use a None sentinel instead.
    if corrective_tokens is None:
        corrective_tokens = set()
    return next(
        decode(sess, model, data_reader, [segment(sentence, 'char')],
               corrective_tokens=corrective_tokens, verbose=verbose))
def parse_txt_file(input_path, truth_path):
    """Parse a CGED-style text corpus plus its truth (error-location) file.

    The truth file has comma-separated rows ``text_id,start_off,end_off,
    error_type`` (offsets are 1-based, inclusive). The input file has
    tab-separated rows ``(sid=<id>)\\t<text>``. For every character of every
    sentence, the label is the error type when the position falls inside an
    annotated error span, otherwise the POS tag from ``segment``.

    :param input_path: path to the sentence file
    :param truth_path: path to the error-annotation file
    :return: (id_lst, word_lst, label_lst) parallel lists
    """
    print('Parse data from %s and %s' % (input_path, truth_path))
    id_lst, word_lst, label_lst = [], [], []
    # Read truth file: map text_id -> {char_index: error_type}
    truth_dict = {}
    with open(truth_path, 'r', encoding='utf-8') as truth_f:
        for line in truth_f:
            parts = line.strip().split(',')
            # BUG FIX: only process well-formed rows. In the original, a
            # malformed row fell through with `text_id` still bound to the
            # previous row's id, so stale state could be written back.
            if len(parts) != 4:
                continue
            text_id, start_off, end_off, error_type = parts
            # Convert the 1-based inclusive span to 0-based char indices.
            locate_dict = {}
            for i in range(int(start_off) - 1, int(end_off)):
                locate_dict[i] = error_type
            if text_id in truth_dict:
                truth_dict[text_id].update(locate_dict)
            else:
                truth_dict[text_id] = locate_dict
    # Read input file and tokenize each sentence.
    with open(input_path, 'r', encoding='utf-8') as input_f:
        for line in input_f:
            parts = line.strip().split('\t')
            # BUG FIX: a blank or malformed line (no tab) raised IndexError
            # on `parts[1]` in the original; skip such lines instead.
            if len(parts) < 2:
                continue
            text_id = parts[0].replace('(sid=', '').replace(')', '')
            text = parts[1]
            # Char-level segmentation with POS tags (parallel sequences).
            word_seq, pos_seq = segment(text, cut_type='char', pos=True)
            if text_id in truth_dict:
                locate_dict = truth_dict[text_id]
                word_arr, label_arr = [], []
                for i, word in enumerate(word_seq):
                    word_arr.append(word)
                    # Error type at annotated positions, POS tag elsewhere.
                    label_arr.append(locate_dict.get(i, pos_seq[i]))
            else:
                # No annotations for this sentence: labels are the POS tags.
                word_arr = word_seq
                label_arr = pos_seq
            id_lst.append(text_id)
            word_lst.append(word_arr)
            label_lst.append(label_arr)
    return id_lst, word_lst, label_lst
def parse_xml_file(path):
    """Parse a CGED-style XML file into id / char / label sequences.

    Each <DOC> holds a TEXT element (with an ``id`` attribute) and zero or
    more ERROR elements carrying ``start_off``/``end_off``/``type``
    attributes (1-based inclusive offsets). Every character of the text is
    labeled with the error type when it lies inside an error span, otherwise
    with its POS tag from ``segment``.

    :param path: path to the XML file to read
    :return: (id_lst, word_lst, label_lst) parallel lists
    """
    print('Parse data from %s' % path)
    id_lst, word_lst, label_lst = [], [], []
    with open(path, 'r', encoding='utf-8') as f:
        # BUG FIX: the original called `minidom.parse(path)` here, re-opening
        # the file and leaving the explicit utf-8 handle `f` unused. Parse
        # the already-open handle instead.
        dom_tree = minidom.parse(f)
        docs = dom_tree.documentElement.getElementsByTagName('DOC')
        for doc in docs:
            # Input the text
            text = doc.getElementsByTagName('TEXT')[0]. \
                childNodes[0].data.strip()
            text_id = doc.getElementsByTagName('TEXT')[0].getAttribute('id')
            errors = doc.getElementsByTagName('ERROR')
            # Locate the error position and error type
            locate_dict = {}
            for error in errors:
                start_off = error.getAttribute('start_off')
                end_off = error.getAttribute('end_off')
                error_type = error.getAttribute('type')
                # 1-based inclusive span -> 0-based char indices.
                for i in range(int(start_off) - 1, int(end_off)):
                    locate_dict[i] = error_type
            # Segment with pos
            word_seq, pos_seq = segment(text, cut_type='char', pos=True)
            word_arr, label_arr = [], []
            for i in range(len(word_seq)):
                if i in locate_dict:
                    word_arr.append(word_seq[i])
                    # Fill with error type
                    label_arr.append(locate_dict[i])
                else:
                    word_arr.append(word_seq[i])
                    # Fill with pos tag
                    label_arr.append(pos_seq[i])
            id_lst.append(text_id)
            word_lst.append(word_arr)
            label_lst.append(label_arr)
    return id_lst, word_lst, label_lst
pass except: print("detect error, sentence:", sentence) return sorted(maybe_error_indices) if __name__ == '__main__': sent = '少先队员因该为老人让坐' # sent = '机七学习是人工智能领遇最能体现智能的一个分知' error_list = detect(sent) print(error_list) sent_chars = [sent[i] for i in error_list] print(sent_chars) from utils.text_utils import segment, tokenize print(get_ngram_score(segment(sent))) print(get_ppl_score(segment(sent))) print(get_ngram_score(list(sent), mode=trigram_char)) print(get_ppl_score(list(sent), mode=trigram_char)) sent = '少先队员应该为老人让座' print(detect(sent)) print(get_ngram_score(segment(sent))) print(get_ppl_score(segment(sent))) print(get_ngram_score(list(sent), mode=trigram_char)) print(get_ppl_score(list(sent), mode=trigram_char))