import json
from collections import defaultdict


def main(src_path, translated_path, output_path, num_sentence):
    src_lines = read_file(src_path)
    translated_lines = read_file(translated_path)

    # Pair each source line with its next `num_sentence` consecutive
    # translations, preserving the order of the translated file.
    formatted = defaultdict(list)
    for src in src_lines:
        for _ in range(num_sentence):
            formatted[src].append(translated_lines.pop(0))

    with open(output_path, 'w') as f:
        json.dump(formatted, f)
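# Illustrative sketch only (not part of the original script): the grouping
# performed by main() above, shown on in-memory lists instead of files, with a
# hypothetical helper name. Each source sentence claims the next
# `num_sentence` translations in order.
def _group_translations(src_lines, translated_lines, num_sentence):
    grouped = {}
    queue = list(translated_lines)
    for src in src_lines:
        grouped.setdefault(src, [])
        for _ in range(num_sentence):
            grouped[src].append(queue.pop(0))
    return grouped

# _group_translations(["hello", "goodbye"],
#                     ["bonjour", "salut", "au revoir", "adieu"], 2)
# -> {"hello": ["bonjour", "salut"], "goodbye": ["au revoir", "adieu"]}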
def main(src_full_path, tgt_full_path, src_train_path, tgt_train_path,
         src_valid_path, tgt_valid_path, validation_ratio, src_test_path,
         tgt_test_path, test_ratio):
    # 1. Tokenize sentences and write to temp file
    nlp = get_spacy_model()
    with nlp.disable_pipes('ner'):
        src_temp_path = "src_temp.txt"
        src_lines = read_file(src_full_path)
        write_file(
            [" ".join(token.text for token in doc) for doc in nlp.pipe(src_lines)],
            src_temp_path)

        tgt_temp_path = "tgt_temp.txt"
        tgt_lines = read_file(tgt_full_path)
        write_file(
            [" ".join(token.text for token in doc) for doc in nlp.pipe(tgt_lines)],
            tgt_temp_path)

    # 2. Split into train, validation, test files
    src_valid_test_temp_path = "src_valid_test_temp.txt"
    split_file(src_temp_path, src_train_path, src_valid_test_temp_path,
               validation_ratio + test_ratio)
    tgt_valid_test_temp_path = "tgt_valid_test_temp.txt"
    split_file(tgt_temp_path, tgt_train_path, tgt_valid_test_temp_path,
               validation_ratio + test_ratio)

    ratio = validation_ratio / (validation_ratio + test_ratio)
    split_file(src_valid_test_temp_path, src_valid_path, src_test_path, ratio)
    split_file(tgt_valid_test_temp_path, tgt_valid_path, tgt_test_path, ratio)

    # 3. Delete temp files
    print("Cleaning up...")
    delete_file(src_temp_path, tgt_temp_path, src_valid_test_temp_path,
                tgt_valid_test_temp_path)
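# `split_file` is defined elsewhere in the repo; below is a minimal sketch of
# one plausible implementation, assuming its last argument is the share of
# lines routed to the second output path (the convention implied by the
# train / valid+test call above). Note that it must split deterministically
# (no shuffling) so that separate calls on the source and target files keep
# the two sides line-aligned.
def split_file(input_path, first_output_path, second_output_path, ratio):
    with open(input_path, encoding="utf-8") as f:
        lines = f.readlines()
    split_point = round(len(lines) * (1 - ratio))
    with open(first_output_path, "w", encoding="utf-8") as f:
        f.writelines(lines[:split_point])
    with open(second_output_path, "w", encoding="utf-8") as f:
        f.writelines(lines[split_point:])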
def _bootstrap_parallel(corpus, src_path, threshold, src_output_path,
                        tgt_output_path):
    print("Bootstrapping pseudo-parallel corpus...")
    source_sentences = file_util.read_file(src_path, unique=True)
    progress_bar = tqdm(total=len(source_sentences))
    thread_lock = threading.Lock()

    def run_thread(corpus_, threshold_, batch_, update_dict):
        nonlocal progress_bar
        # For each source sentence, keep the most similar target-side sentence
        # that clears the similarity threshold (if any).
        for source_sentence in batch_:
            most_similar_sentence = corpus_.get_most_similar_sentence(
                source_sentence, threshold_)
            if most_similar_sentence is not None:
                update_dict[source_sentence] = most_similar_sentence
            with thread_lock:
                progress_bar.update()

    # Split the source sentences into one batch per thread.
    pseudo_parallel_corpus = dict()
    threads = list()
    num_threads = 5
    batch_size = math.ceil(len(source_sentences) / num_threads)
    for i in range(0, len(source_sentences), batch_size):
        batch = source_sentences[i:i + batch_size]
        thread = threading.Thread(target=run_thread,
                                  args=(corpus, threshold, batch,
                                        pseudo_parallel_corpus))
        threads.append(thread)
        thread.start()
    for thread in threads:
        thread.join()
    progress_bar.close()

    file_util.write_file(list(pseudo_parallel_corpus.keys()), src_output_path)
    file_util.write_file(list(pseudo_parallel_corpus.values()), tgt_output_path)
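# The `corpus` object is assumed to expose get_most_similar_sentence(sentence,
# threshold), returning the closest stored sentence or None when nothing
# clears the threshold. The sketch below illustrates that interface with
# simple unigram Jaccard overlap; the project's real corpus class presumably
# uses a proper sentence-similarity model.
class OverlapCorpus:
    def __init__(self, sentences):
        self.sentences = sentences

    def get_most_similar_sentence(self, sentence, threshold):
        query = set(sentence.lower().split())
        best_sentence, best_score = None, threshold
        for candidate in self.sentences:
            tokens = set(candidate.lower().split())
            score = len(query & tokens) / max(len(query | tokens), 1)
            if score > best_score:
                best_sentence, best_score = candidate, score
        return best_sentence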
def _refine_parallel(corpus, src_parallel_path, tgt_parallel_path,
                     candidate_path, src_output_path, tgt_output_path):
    print("Refining pseudo-parallel corpus...")

    def get_refined_target(src, current_tgt, candidate_tgt):
        """Refine a pair of pseudo-parallel sentences."""
        # Keep whichever of the current and candidate targets is closer to the
        # source in word mover's distance.
        current_wmd = nlp_util.get_word_mover_dist(src, current_tgt)
        candidate_wmd = nlp_util.get_word_mover_dist(src, candidate_tgt)
        new_target, new_target_wmd = (current_tgt, current_wmd) \
            if current_wmd < candidate_wmd else (candidate_tgt, candidate_wmd)
        # Then compare against the most similar sentence from the original
        # corpus and keep whichever is closer to the source.
        most_similar_original = corpus.get_most_similar_sentence(new_target,
                                                                 threshold=0)
        original_wmd = nlp_util.get_word_mover_dist(src, most_similar_original)
        return new_target if new_target_wmd < original_wmd else most_similar_original

    src_sentences = file_util.read_file(src_parallel_path)
    tgt_sentences = file_util.read_file(tgt_parallel_path)
    candidates = file_util.read_json(candidate_path)
    pseudo_parallel_corpus = {
        src_sentences[i]: tgt_sentences[i]
        for i in range(len(src_sentences))
    }

    progress_bar = tqdm(total=len(src_sentences))
    refined_count = 0
    thread_lock = threading.Lock()

    def run_thread(src_batch_, tgt_batch_):
        nonlocal progress_bar
        nonlocal refined_count
        for j in range(len(src_batch_)):
            source_sentence = src_batch_[j]
            current_target = tgt_batch_[j]
            candidate = candidates[source_sentence][0]
            refined_target = get_refined_target(source_sentence,
                                                current_target, candidate)
            if pseudo_parallel_corpus[source_sentence] != refined_target:
                with thread_lock:
                    refined_count += 1
                pseudo_parallel_corpus[source_sentence] = refined_target
            with thread_lock:
                progress_bar.update()

    # Split the parallel pairs into one batch per thread.
    threads = list()
    num_threads = 5
    batch_size = math.ceil(len(src_sentences) / num_threads)
    for i in range(0, len(src_sentences), batch_size):
        src_batch = src_sentences[i:i + batch_size]
        tgt_batch = tgt_sentences[i:i + batch_size]
        thread = threading.Thread(target=run_thread,
                                  args=(src_batch, tgt_batch))
        threads.append(thread)
        thread.start()
    for thread in threads:
        thread.join()
    progress_bar.close()

    file_util.write_file(list(pseudo_parallel_corpus.keys()), src_output_path)
    file_util.write_file(list(pseudo_parallel_corpus.values()), tgt_output_path)

    update_rate = refined_count / len(pseudo_parallel_corpus)
    print(f"Number of pairs refined: {refined_count} ({update_rate * 100}%)")
    return update_rate
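# `nlp_util.get_word_mover_dist` is defined elsewhere; below is a minimal
# sketch of one way it could be implemented, using gensim's Word Mover's
# Distance over pretrained word vectors. The vector file name is a
# placeholder, not the project's actual configuration.
from gensim.models import KeyedVectors

_word_vectors = KeyedVectors.load_word2vec_format("vectors.bin", binary=True)

def get_word_mover_dist(sentence_a, sentence_b):
    # Lower distance means the two sentences are semantically closer.
    return _word_vectors.wmdistance(sentence_a.lower().split(),
                                    sentence_b.lower().split())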