Example #1
0
def parse_xml_file(path):
    """Parse a SIGHAN-style XML file into char-segmented (source, target) pairs.

    Each <DOC> contributes its <TEXT> and <CORRECTION> contents, split into
    short sentences. Pairs whose split counts differ, or whose source is
    longer than 40 chars, are reported and skipped. Returns a list of unique
    [source, target] pairs, in first-seen order.
    """
    print('Parse data from %s' % path)
    data_list = []
    # O(1) membership dedup; the original tested `pair not in data_list`,
    # which is O(n) per pair and quadratic overall.
    seen = set()
    dom_tree = minidom.parse(path)
    docs = dom_tree.documentElement.getElementsByTagName('DOC')
    for doc in docs:
        # Original (possibly erroneous) text
        text = doc.getElementsByTagName('TEXT')[0]. \
            childNodes[0].data.strip()
        # Gold corrected text
        correction = doc.getElementsByTagName('CORRECTION')[0]. \
            childNodes[0].data.strip()

        texts = split_2_short_text(text)
        corrections = split_2_short_text(correction)
        if len(texts) != len(corrections):
            print('error:' + text + '\t' + correction)
            continue
        for short_text, short_corr in zip(texts, corrections):
            if len(short_text) > 40:
                print('error:' + short_text + '\t' + short_corr)
                continue
            source = segment(short_text, cut_type='char')
            target = segment(short_corr, cut_type='char')
            # segment(...) presumably returns a list of tokens — tuples make
            # the pair hashable for the dedup set. TODO confirm return type.
            key = (tuple(source), tuple(target))
            if key not in seen:
                seen.add(key)
                data_list.append([source, target])
    return data_list
Example #2
0
def parse_xml_file(path):
    """Read a SIGHAN XML file and build char-level [source, target] pairs."""
    print('Parse data from %s' % path)
    dom_tree = minidom.parse(path)
    doc_nodes = dom_tree.documentElement.getElementsByTagName('DOC')
    data_list = []
    for doc_node in doc_nodes:
        # Raw text under <TEXT>, gold correction under <CORRECTION>
        raw = doc_node.getElementsByTagName('TEXT')[0].childNodes[0].data.strip()
        gold = doc_node.getElementsByTagName('CORRECTION')[0].childNodes[0].data.strip()
        # Character-level segmentation for both sides of the pair
        data_list.append([segment(raw, cut_type='char'),
                          segment(gold, cut_type='char')])
    return data_list
Example #3
0
def parse_txt_file(input_path, truth_path):
    """Parse CGED-style input/truth text files into id, token and label lists.

    truth file lines look like "text_id,start_off,end_off,error_type" with
    1-based inclusive character offsets; input file lines look like
    "(sid=ID)\ttext". Characters inside an error span are labelled with the
    error type; all other characters get their POS tag.

    Returns:
        (id_lst, word_lst, label_lst) — parallel lists, one entry per input line.
    """
    print('Parse data from %s and %s' % (input_path, truth_path))
    id_lst, word_lst, label_lst = [], [], []
    # text_id -> {0-based char index: error type}
    truth_dict = {}
    with open(truth_path, 'r', encoding='utf-8') as truth_f:
        for line in truth_f:
            parts = line.strip().split(',')
            # Malformed lines (wrong field count) are silently skipped,
            # matching the original behavior.
            if len(parts) != 4:
                continue
            text_id = parts[0]
            start_off, end_off = int(parts[1]), int(parts[2])
            error_type = parts[3].strip()
            # 1-based inclusive offsets -> 0-based character indices.
            # (The original also computed a BIO-style tag, 'B-'/'I-' + type,
            # but never used it; that dead code is removed here.)
            locate_dict = {i: error_type for i in range(start_off - 1, end_off)}
            if text_id in truth_dict:
                truth_dict[text_id].update(locate_dict)
            else:
                truth_dict[text_id] = locate_dict

    # read input file and tokenize
    with open(input_path, 'r', encoding='utf-8') as input_f:
        for line in input_f:
            parts = line.strip().split('\t')
            text_id = parts[0].replace('(sid=', '').replace(')', '')
            text = parts[1]
            # char-level segmentation with POS tags
            word_seq, pos_seq = segment(text, cut_type='char', pos=True)
            locate_dict = truth_dict.get(text_id, {})
            word_arr = list(word_seq)
            # Error type inside an error span, otherwise the POS tag.
            label_arr = [locate_dict.get(i, pos_seq[i])
                         for i in range(len(word_seq))]
            id_lst.append(text_id)
            word_lst.append(word_arr)
            label_lst.append(label_arr)
    return id_lst, word_lst, label_lst
Example #4
0
def parse_xml_file(path):
    """Collect char-segmented corrected sentences from an XML corpus file."""
    print('Parse data from %s' % path)
    with open(path, 'r', encoding='utf-8') as f:
        dom_tree = minidom.parse(f)
    word_arr = []
    for doc in dom_tree.documentElement.getElementsByTagName('DOC'):
        # Corrected sentence text lives under <CORRECTION>
        corrected = doc.getElementsByTagName('CORRECTION')[0].childNodes[0].data.strip()
        # Character-level token sequence, no POS tags
        word_arr.append(segment(corrected, cut_type='char', pos=False))
    return word_arr
Example #5
0
def parse_xml_file(path):
    """Parse a CGED XML file into (ids, char tokens, labels).

    For each <DOC>, characters inside an <ERROR> span (1-based inclusive
    offsets) are labelled with the error's type attribute; every other
    character gets its POS tag.

    Returns:
        (id_lst, word_lst, label_lst) — parallel lists, one entry per <DOC>.
    """
    print('Parse data from %s' % path)
    id_lst, word_lst, label_lst = [], [], []
    with open(path, 'r', encoding='utf-8') as f:
        # BUG FIX: parse the opened handle; the original called
        # minidom.parse(path) and never used `f`, making the `with` pointless.
        dom_tree = minidom.parse(f)
    docs = dom_tree.documentElement.getElementsByTagName('DOC')
    for doc in docs:
        # Input the text
        text = doc.getElementsByTagName('TEXT')[0]. \
            childNodes[0].data.strip()
        text_id = doc.getElementsByTagName('TEXT')[0].getAttribute('id')
        # 0-based char index -> error type, for every char in an error span.
        # (A BIO-style 'B-'/'I-' tag was computed but unused in the original;
        # that dead code is removed here.)
        locate_dict = {}
        for error in doc.getElementsByTagName('ERROR'):
            start_off = int(error.getAttribute('start_off'))
            end_off = int(error.getAttribute('end_off'))
            error_type = error.getAttribute('type')
            # 1-based inclusive offsets -> 0-based indices
            for i in range(start_off - 1, end_off):
                locate_dict[i] = error_type
        # Segment with pos
        word_seq, pos_seq = segment(text, cut_type='char', pos=True)
        word_arr = list(word_seq)
        # Error type inside a span, POS tag elsewhere.
        label_arr = [locate_dict.get(i, pos_seq[i]) for i in range(len(word_seq))]
        id_lst.append(text_id)
        word_lst.append(word_arr)
        label_lst.append(label_arr)
    return id_lst, word_lst, label_lst
Example #6
0
        default_logger.warn("index error, sentence:" + sentence + ie)
    except Exception as e:
        default_logger.warn("detect error, sentence:" + sentence + e)
    return sorted(maybe_error_indices)


if __name__ == '__main__':
    # Demo: detect error positions in a sentence containing two typos
    # ('因该' should be '应该', '坐' should be '座').
    sent = '少先队员因该为老人让坐'
    # sent = '机七学习是人工智能领遇最能体现智能的一个分知'
    error_list = detect(sent)
    print(error_list)

    # Characters at the detected error indices
    sent_chars = [sent[i] for i in error_list]
    print(sent_chars)

    from pycorrector.utils.text_utils import segment, tokenize

    # Language-model scores over the word-segmented sentence
    print(get_ngram_score(segment(sent)))
    print(get_ppl_score(segment(sent)))

    # Same scores using a character trigram model
    print(get_ngram_score(list(sent), mode=trigram_char))
    print(get_ppl_score(list(sent), mode=trigram_char))

    # Repeat with the corrected sentence — presumably scores better; verify.
    sent = '少先队员应该为老人让座'
    print(detect(sent))
    print(get_ngram_score(segment(sent)))
    print(get_ppl_score(segment(sent)))

    print(get_ngram_score(list(sent), mode=trigram_char))
    print(get_ppl_score(list(sent), mode=trigram_char))
Example #7
0
        pass
    except Exception as e:
        print("detect error, sentence:", sentence, e)
    return sorted(maybe_error_indices)


if __name__ == '__main__':
    # Demo: detect error positions in a sentence containing two typos
    # ('因该' should be '应该', '坐' should be '座').
    sent = '少先队员因该为老人让坐'
    # sent = '机七学习是人工智能领遇最能体现智能的一个分知'
    error_list = detect(sent)
    print(error_list)

    # Characters at the detected error indices
    sent_chars = [sent[i] for i in error_list]
    print(sent_chars)

    from pycorrector.utils.text_utils import segment, tokenize

    # Language-model scores over the word-segmented sentence
    print(get_ngram_score(segment(sent)))
    print(get_ppl_score(segment(sent)))

    # Same scores using a character trigram model
    print(get_ngram_score(list(sent), mode=trigram_char))
    print(get_ppl_score(list(sent), mode=trigram_char))

    # Repeat with the corrected sentence — presumably scores better; verify.
    sent = '少先队员应该为老人让座'
    print(detect(sent))
    print(get_ngram_score(segment(sent)))
    print(get_ppl_score(segment(sent)))

    print(get_ngram_score(list(sent), mode=trigram_char))
    print(get_ppl_score(list(sent), mode=trigram_char))