Esempio n. 1
0
def convert_data_from_raw_files(source_file, target_file, output_file,
                                chunk_size):
    """Convert a parallel corpus into tagged training lines.

    Reads source/target sentence pairs, aligns each pair, verifies that
    the tagged line round-trips back to the target sentence (ignoring
    whitespace), and appends the surviving tagged lines to
    ``output_file`` in chunks.

    Parameters
    ----------
    source_file : path to the (possibly erroneous) source sentences.
    target_file : path to the corrected target sentences.
    output_file : path the tagged lines are appended to.
    chunk_size : flush the in-memory buffer once it grows past this size.
    """
    tagged = []
    source_data, target_data = read_parallel_lines(source_file, target_file)
    print(f"The size of raw dataset is {len(source_data)}")
    cnt_total, cnt_all, cnt_tp = 0, 0, 0
    # tqdm: alignment dominates the runtime, so show progress here.
    for source_sent, target_sent in tqdm(zip(source_data, target_data)):
        # NOTE: the original wrapped this call in try/except blocks that
        # simply re-invoked the same call ("debug mode"); for a
        # deterministic aligner that retry is a no-op, so let any
        # exception propagate directly.
        aligned_sent = align_sequences(source_sent, target_sent)
        if source_sent != target_sent:
            cnt_tp += 1  # true positive: the pair actually differs
        alignments = [aligned_sent]
        cnt_all += len(alignments)
        check_sent = convert_tagged_line(aligned_sent)

        # Round-trip check: the tagged line must reconstruct the target
        # (whitespace-insensitively); drop pairs that fail.
        if "".join(check_sent.split()) != "".join(target_sent.split()):
            print(f"Incorrect pair: \n{target_sent}\n{check_sent}")
            continue
        cnt_total += len(alignments)
        tagged.extend(alignments)
        # Flush periodically so memory use stays bounded.
        if len(tagged) > chunk_size:
            write_lines(output_file, tagged, mode='a')
            tagged = []

    print(f"Overall extracted {cnt_total}. "
          f"Original TP {cnt_tp}."
          f" Original TN {cnt_all - cnt_tp}")
    if tagged:  # flush whatever remains in the buffer
        write_lines(output_file, tagged, mode='a')
Esempio n. 2
0
def convert_data_from_raw_files(source_file, target_file, output_file,
                                chunk_size):
    """Convert a parallel corpus into tagged training lines, with extra
    punctuation/capitalization normalization applied first.

    Before alignment, every period on both sides is padded with single
    spaces (runs of spaces collapsed), and in the targets the first
    character following each ". " is uppercased.  Pairs are then aligned,
    round-trip checked, and written out in chunks.

    Parameters
    ----------
    source_file : path to the (possibly erroneous) source sentences.
    target_file : path to the corrected target sentences.
    output_file : path used for the chunked appends (see NOTE at the end
        about the final flush, which resolves a different path).
    chunk_size : flush the in-memory buffer once it grows past this size.
    """
    import re
    from pathlib import Path

    def _space_out_periods(text):
        # Surround each '.' with spaces, then collapse repeated spaces.
        spaced = re.sub(r'\.', r' . ', text).rstrip()
        return re.sub(r' +', r' ', spaced).rstrip()

    def _capitalize_after_periods(text):
        # Uppercase the character that follows every '. ' sequence.
        # A set gives O(1) membership per character (the original used a
        # list, making this loop quadratic).
        upper_idxs = {m.span()[-1] - 1 for m in re.finditer(r'\. .', text)}
        return ''.join(ch.upper() if i in upper_idxs else ch
                       for i, ch in enumerate(text))

    tagged = []
    source_data, target_data = read_parallel_lines(source_file, target_file)

    # Normalize period spacing on both sides; targets also get the
    # capitalization fix.
    source_data = [_space_out_periods(sent) for sent in source_data]
    target_data = [_capitalize_after_periods(_space_out_periods(sent))
                   for sent in target_data]

    # ---- main preprocessing loop ----
    print(f"The size of raw dataset is {len(source_data)}")
    cnt_total, cnt_all, cnt_tp = 0, 0, 0
    # tqdm: alignment dominates the runtime, so show progress here.
    for source_sent, target_sent in tqdm(zip(source_data, target_data)):
        # NOTE: the original wrapped this call in try/except blocks that
        # simply re-invoked the same call ("debug mode"); for a
        # deterministic aligner that retry is a no-op, so let any
        # exception propagate directly.
        aligned_sent = align_sequences(source_sent, target_sent)
        if source_sent != target_sent:
            cnt_tp += 1  # true positive: the pair actually differs
        alignments = [aligned_sent]
        cnt_all += len(alignments)
        check_sent = convert_tagged_line(aligned_sent)

        # Round-trip check: the tagged line must reconstruct the target
        # (whitespace-insensitively); drop pairs that fail.
        if "".join(check_sent.split()) != "".join(target_sent.split()):
            print(f"Incorrect pair: \n{target_sent}\n{check_sent}")
            continue
        cnt_total += len(alignments)
        tagged.extend(alignments)
        # Flush periodically so memory use stays bounded.
        if len(tagged) > chunk_size:
            write_lines(output_file, tagged, mode='a')
            tagged = []

    print(f"Overall extracted {cnt_total}. "
          f"Original TP {cnt_tp}."
          f" Original TN {cnt_all - cnt_tp}")
    # NOTE(review): the chunk flushes above append to `output_file`, but
    # this final flush writes (mode 'w') to a path resolved against this
    # module's grandparent directory.  If both trigger, output is split
    # across two files and the 'w' may truncate an existing one.
    # Preserved as-is — confirm which path/mode is intended.
    output_filedir = Path(__file__).resolve().parent.parent / output_file
    if tagged:  # flush whatever remains in the buffer
        write_lines(output_filedir, tagged, 'w')