def convert_data_from_raw_files(source_file, target_file, output_file, chunk_size):
    """Align parallel sentences, tag the edits, and append them to ``output_file``.

    Parameters
    ----------
    source_file, target_file : parallel text files, one sentence per line.
    output_file : path that tagged alignment lines are appended to.
    chunk_size : flush the in-memory buffer to disk once it holds more
        than this many tagged lines, keeping memory bounded on large corpora.
    """
    tagged = []
    source_data, target_data = read_parallel_lines(source_file, target_file)
    print(f"The size of raw dataset is {len(source_data)}")
    cnt_total, cnt_all, cnt_tp = 0, 0, 0
    # Alignment is the slow step, so wrap the pair iterator in tqdm for progress.
    for source_sent, target_sent in tqdm(zip(source_data, target_data)):
        # NOTE(review): the original wrapped this in try/except blocks whose
        # handler simply re-ran the identical call — a retry of a deterministic
        # function raises again, so the handlers were dead code and are removed.
        aligned_sent = align_sequences(source_sent, target_sent)
        if source_sent != target_sent:
            cnt_tp += 1  # true positive: the pair actually contains an edit
        alignments = [aligned_sent]
        cnt_all += len(alignments)
        # Round-trip check: converting the tagged line back must reproduce the
        # target (modulo whitespace); otherwise the tags are unusable.
        check_sent = convert_tagged_line(aligned_sent)
        if "".join(check_sent.split()) != "".join(target_sent.split()):
            print(f"Incorrect pair: \n{target_sent}\n{check_sent}")
            continue  # skip pairs whose tags do not reconstruct the target
        cnt_total += len(alignments)
        tagged.extend(alignments)
        if len(tagged) > chunk_size:
            # Flush a chunk so memory stays bounded.
            write_lines(output_file, tagged, mode='a')
            tagged = []
    print(f"Overall extracted {cnt_total}. "
          f"Original TP {cnt_tp}."
          f" Original TN {cnt_all - cnt_tp}")
    if tagged:  # flush the final partial chunk
        write_lines(output_file, tagged, 'a')
def convert_data_from_raw_files(source_file, target_file, output_file, chunk_size):
    """Preprocess, align and tag parallel sentences, appending results to disk.

    Variant of the plain converter that first normalizes the raw text:
    every '.' is padded with single spaces and runs of spaces collapsed
    (both sides); targets additionally get the first letter after each
    '. ' upper-cased to restore sentence-initial capitalization.

    Parameters
    ----------
    source_file, target_file : parallel text files, one sentence per line.
    output_file : path (relative to the project root) the tagged lines go to.
    chunk_size : flush the in-memory buffer once it exceeds this many lines.
    """
    from pathlib import Path

    tagged = []
    source_data, target_data = read_parallel_lines(source_file, target_file)

    # ---- preprocessing -------------------------------------------------
    source_data = [_pad_periods(sent) for sent in source_data]
    target_data = [_capitalize_after_periods(_pad_periods(sent))
                   for sent in target_data]

    # ---- main conversion loop ------------------------------------------
    print(f"The size of raw dataset is {len(source_data)}")
    cnt_total, cnt_all, cnt_tp = 0, 0, 0
    # BUG FIX: the original appended intermediate chunks to `output_file`
    # (cwd-relative) but wrote the final chunk to a *different* resolved
    # path with mode 'w', splitting the output across two files.  Resolve
    # the path once, relative to the project root, and append throughout.
    out_path = Path(__file__).resolve().parent.parent / output_file
    # Alignment is the slow step, so wrap the pair iterator in tqdm.
    for source_sent, target_sent in tqdm(zip(source_data, target_data)):
        # The original's try/except handlers re-ran the identical call
        # ("debug mode" leftovers) and are removed as dead code.
        aligned_sent = align_sequences(source_sent, target_sent)
        if source_sent != target_sent:
            cnt_tp += 1  # true positive: the pair actually contains an edit
        alignments = [aligned_sent]
        cnt_all += len(alignments)
        # Round-trip check: the tagged line must reconstruct the target
        # (modulo whitespace); otherwise the tags are unusable.
        check_sent = convert_tagged_line(aligned_sent)
        if "".join(check_sent.split()) != "".join(target_sent.split()):
            print(f"Incorrect pair: \n{target_sent}\n{check_sent}")
            continue  # skip pairs whose tags do not reconstruct the target
        cnt_total += len(alignments)
        tagged.extend(alignments)
        if len(tagged) > chunk_size:
            write_lines(out_path, tagged, mode='a')
            tagged = []
    print(f"Overall extracted {cnt_total}. "
          f"Original TP {cnt_tp}."
          f" Original TN {cnt_all - cnt_tp}")
    if tagged:  # flush the final partial chunk (same path, still appending)
        write_lines(out_path, tagged, 'a')


def _pad_periods(sent):
    """Return *sent* with every '.' surrounded by single spaces.

    Runs of spaces introduced by the padding are collapsed, and trailing
    whitespace is stripped.
    """
    import re
    sent = re.sub(r'\.', r' . ', sent).rstrip()
    return re.sub(r' +', r' ', sent).rstrip()


def _capitalize_after_periods(sent):
    """Upper-case the first character that follows each '. ' in *sent*."""
    import re
    # span()[-1] - 1 is the index of the character matched right after '. '.
    starts = {m.span()[-1] - 1 for m in re.finditer(r'\. .', sent)}
    return ''.join(ch.upper() if i in starts else ch
                   for i, ch in enumerate(sent))