Exemple #1
0
def parse_xml_file(path):
    """Read an XML corpus file and build [source, target] character-token pairs.

    Each <DOC> element contributes one pair: the text under its <TEXT> node and
    the text under its <CORRECTION> node, both segmented at character level via
    ``segment(..., cut_type='char')``.

    :param path: path to the XML file (read as UTF-8).
    :return: list of [source_tokens, target_tokens] lists.
    """
    print('Parse data from %s' % path)
    pairs = []
    with open(path, 'r', encoding='utf-8') as xml_f:
        dom_tree = minidom.parse(xml_f)
    for doc in dom_tree.documentElement.getElementsByTagName('DOC'):
        # Raw (possibly erroneous) text of this document.
        raw_text = doc.getElementsByTagName('TEXT')[0].childNodes[0].data.strip()
        # Gold corrected text of this document.
        corrected = doc.getElementsByTagName('CORRECTION')[0].childNodes[0].data.strip()
        pairs.append([segment(raw_text, cut_type='char'),
                      segment(corrected, cut_type='char')])
    return pairs
Exemple #2
0
def parse_xml_file(path):
    """Parse an XML corpus file into [source, target] token pairs.

    For every <DOC> element, takes the text of its <TEXT> node as the source
    and the text of its <CORRECTION> node as the target, segments both at
    character granularity, and collects them as [source, target].

    :param path: path to the XML file (opened as UTF-8).
    :return: list of [source_tokens, target_tokens] lists, one per <DOC>.
    """
    print('Parse data from %s' % path)
    data_list = []
    with open(path, 'r', encoding='utf-8') as f:
        dom_tree = minidom.parse(f)
    docs = dom_tree.documentElement.getElementsByTagName('DOC')
    for doc in docs:
        # Input the text
        # NOTE(review): assumes the first <TEXT> node has a text child; an
        # empty element would raise IndexError here — confirm corpus format.
        text = doc.getElementsByTagName('TEXT')[0]. \
            childNodes[0].data.strip()
        # Input the correct text
        correction = doc.getElementsByTagName('CORRECTION')[0]. \
            childNodes[0].data.strip()
        # Segment both sides into character-level tokens.
        source = segment(text, cut_type='char')
        target = segment(correction, cut_type='char')
        data_list.append([source, target])
    return data_list
Exemple #3
0
def decode_sentence(sess,
                    model,
                    data_reader,
                    sentence,
                    corrective_tokens=None,
                    verbose=True):
    """Used with InteractiveSession in IPython.

    Decodes a single sentence (character-segmented) and returns the first
    result yielded by ``decode``.

    :param sess: TensorFlow session-like object passed through to ``decode``.
    :param model: model passed through to ``decode``.
    :param data_reader: data reader passed through to ``decode``.
    :param sentence: raw sentence string; segmented with cut type 'char'.
    :param corrective_tokens: optional set of corrective tokens; defaults to an
        empty set. (Was a mutable default argument ``set()`` — a shared object
        across calls; replaced with the None-sentinel idiom.)
    :param verbose: forwarded to ``decode``.
    """
    if corrective_tokens is None:
        corrective_tokens = set()
    return next(
        decode(sess,
               model,
               data_reader, [segment(sentence, 'char')],
               corrective_tokens=corrective_tokens,
               verbose=verbose))
Exemple #4
0
def parse_txt_file(input_path, truth_path):
    """Parse an input/truth text-file pair into ids, tokens and labels.

    The truth file holds comma-separated lines ``text_id,start,end,error_type``;
    offsets are 1-based inclusive and are expanded into a per-character
    ``{index: error_type}`` map per text id. The input file holds tab-separated
    lines ``(sid=ID)\ttext``; each text is character-segmented with POS tags,
    and every character is labeled with its error type if its position is
    flagged, otherwise with its POS tag.

    :param input_path: path to the tab-separated input file (UTF-8).
    :param truth_path: path to the comma-separated truth file (UTF-8).
    :return: (id_lst, word_lst, label_lst) parallel lists.
    """
    print('Parse data from %s and %s' % (input_path, truth_path))
    id_lst, word_lst, label_lst = [], [], []
    # Read truth file: accumulate error positions per text id.
    truth_dict = {}
    with open(truth_path, 'r', encoding='utf-8') as truth_f:
        for line in truth_f:
            parts = line.strip().split(',')
            # Locate the error position; lines without exactly 4 fields are skipped.
            if len(parts) == 4:
                text_id, start_off, end_off, error_type = parts
                # Offsets are 1-based inclusive -> 0-based half-open range.
                locate_dict = {i: error_type
                               for i in range(int(start_off) - 1, int(end_off))}
                if text_id in truth_dict:
                    truth_dict[text_id].update(locate_dict)
                else:
                    truth_dict[text_id] = locate_dict

    # Read input file and tokenize.
    with open(input_path, 'r', encoding='utf-8') as input_f:
        for line in input_f:
            line = line.strip()
            if not line:
                # Guard: a blank line would otherwise raise IndexError on parts[1].
                continue
            parts = line.split('\t')
            text_id = parts[0].replace('(sid=', '').replace(')', '')
            text = parts[1]
            # Segment with POS tags, character granularity.
            word_seq, pos_seq = segment(text, cut_type='char', pos=True)
            if text_id in truth_dict:
                locate_dict = truth_dict[text_id]
                word_arr = list(word_seq)
                # Label: error type at flagged positions, POS tag elsewhere.
                label_arr = [locate_dict.get(i, pos_seq[i])
                             for i in range(len(word_seq))]
            else:
                word_arr = word_seq
                label_arr = pos_seq
            id_lst.append(text_id)
            word_lst.append(word_arr)
            label_lst.append(label_arr)
    return id_lst, word_lst, label_lst
Exemple #5
0
def parse_xml_file(path):
    """Parse an XML corpus file into ids, character tokens and labels.

    For each <DOC>: reads the <TEXT> node's text and its ``id`` attribute,
    expands every <ERROR> element's 1-based inclusive [start_off, end_off]
    span into per-character error types, then labels each character with its
    error type when flagged or its POS tag otherwise.

    :param path: path to the XML file (read as UTF-8).
    :return: (id_lst, word_lst, label_lst) parallel lists.

    Bug fix: the original opened the file as ``f`` but called
    ``minidom.parse(path)``, ignoring the UTF-8 handle and reopening the file;
    now parses the opened handle.
    """
    print('Parse data from %s' % path)
    id_lst, word_lst, label_lst = [], [], []
    with open(path, 'r', encoding='utf-8') as f:
        dom_tree = minidom.parse(f)
    docs = dom_tree.documentElement.getElementsByTagName('DOC')
    for doc in docs:
        # Input the text
        text = doc.getElementsByTagName('TEXT')[0]. \
            childNodes[0].data.strip()
        text_id = doc.getElementsByTagName('TEXT')[0].getAttribute('id')
        errors = doc.getElementsByTagName('ERROR')
        # Locate the error position and error type
        locate_dict = {}
        for error in errors:
            start_off = error.getAttribute('start_off')
            end_off = error.getAttribute('end_off')
            error_type = error.getAttribute('type')
            # 1-based inclusive span -> 0-based half-open range.
            for i in range(int(start_off) - 1, int(end_off)):
                locate_dict[i] = error_type
        # Segment with pos
        word_seq, pos_seq = segment(text, cut_type='char', pos=True)
        word_arr, label_arr = [], []
        for i in range(len(word_seq)):
            word_arr.append(word_seq[i])
            if i in locate_dict:
                # Fill with error type
                label_arr.append(locate_dict[i])
            else:
                # Fill with pos tag
                label_arr.append(pos_seq[i])
        id_lst.append(text_id)
        word_lst.append(word_arr)
        label_lst.append(label_arr)
    return id_lst, word_lst, label_lst
Exemple #6
0
        pass
    except:
        print("detect error, sentence:", sentence)
    return sorted(maybe_error_indices)


if __name__ == '__main__':
    # Demo: detect character errors in a sample sentence, then score it with
    # word-level and character-level language models.
    sentence = '少先队员因该为老人让坐'
    # sentence = '机七学习是人工智能领遇最能体现智能的一个分知'
    error_ids = detect(sentence)
    print(error_ids)

    # Characters flagged as potentially erroneous.
    flagged_chars = [sentence[idx] for idx in error_ids]
    print(flagged_chars)

    from utils.text_utils import segment, tokenize

    # Word-segmented n-gram and perplexity scores.
    print(get_ngram_score(segment(sentence)))
    print(get_ppl_score(segment(sentence)))

    # Character-level (trigram) n-gram and perplexity scores.
    print(get_ngram_score(list(sentence), mode=trigram_char))
    print(get_ppl_score(list(sentence), mode=trigram_char))

    # Same pipeline on the corrected sentence for comparison.
    sentence = '少先队员应该为老人让座'
    print(detect(sentence))
    print(get_ngram_score(segment(sentence)))
    print(get_ppl_score(segment(sentence)))

    print(get_ngram_score(list(sentence), mode=trigram_char))
    print(get_ppl_score(list(sentence), mode=trigram_char))