Example no. 1
import json
from collections import defaultdict

# `read_file` is a project helper assumed to return the file's lines as a list.
def main(src_path, translated_path, output_path, num_sentence):
    src_lines = read_file(src_path)
    translated_lines = read_file(translated_path)
    formatted = defaultdict(list)

    # The translated file holds `num_sentence` consecutive translations for
    # each source line; group them under their source sentence.
    for src in src_lines:
        for _ in range(num_sentence):
            formatted[src].append(translated_lines.pop(0))

    with open(output_path, 'w') as f:
        json.dump(formatted, f)
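
A minimal usage sketch with hypothetical file names, assuming the translated file
carries num_sentence consecutive translations per source line:

# Hypothetical paths for illustration only.
main("source.txt", "translations.txt", "grouped.json", num_sentence=3)
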
Example no. 2
# `get_spacy_model`, `read_file`, `write_file`, `split_file` and `delete_file`
# are project helpers assumed to be importable in this module.
def main(src_full_path, tgt_full_path, src_train_path, tgt_train_path,
         src_valid_path, tgt_valid_path, validation_ratio, src_test_path,
         tgt_test_path, test_ratio):
    # 1. Tokenize sentences and write to temp file
    nlp = get_spacy_model()
    with nlp.disable_pipes('ner'):
        src_temp_path = "src_temp.txt"
        src_lines = read_file(src_full_path)
        write_file(
            [" ".join(token.text for token in doc)
             for doc in nlp.pipe(src_lines)],
            src_temp_path)

        tgt_temp_path = "tgt_temp.txt"
        tgt_lines = read_file(tgt_full_path)
        write_file(
            [" ".join(token.text for token in doc)
             for doc in nlp.pipe(tgt_lines)],
            tgt_temp_path)

    # 2. Split into train, validation, test files
    src_valid_test_temp_path = "src_valid_test_temp.txt"
    split_file(src_temp_path, src_train_path, src_valid_test_temp_path,
               validation_ratio + test_ratio)

    tgt_valid_test_temp_path = "tgt_valid_test_temp.txt"
    split_file(tgt_temp_path, tgt_train_path, tgt_valid_test_temp_path,
               validation_ratio + test_ratio)

    ratio = validation_ratio / (validation_ratio + test_ratio)
    split_file(src_valid_test_temp_path, src_valid_path, src_test_path, ratio)
    split_file(tgt_valid_test_temp_path, tgt_valid_path, tgt_test_path, ratio)

    # 3. Delete temp files
    print("Cleaning up...")
    delete_file(src_temp_path, tgt_temp_path, src_valid_test_temp_path,
                tgt_valid_test_temp_path)
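
A hedged usage sketch with hypothetical paths; the ratios below (10% validation,
10% test) are illustrative only, and the exact split semantics depend on the
project's split_file helper:

# Hypothetical file paths and split ratios.
main("corpus.src", "corpus.tgt",
     "train.src", "train.tgt",
     "valid.src", "valid.tgt", 0.1,
     "test.src", "test.tgt", 0.1)
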
Example no. 3
import math
import threading

from tqdm import tqdm

# `file_util` is a project helper module; `corpus` is a project object assumed
# to expose get_most_similar_sentence(sentence, threshold).
def _bootstrap_parallel(corpus, src_path, threshold, src_output_path,
                        tgt_output_path):
    print("Bootstrapping pseudo-parallel corpus...")
    source_sentences = file_util.read_file(src_path, unique=True)

    progress_bar = tqdm(total=len(source_sentences))
    thread_lock = threading.Lock()

    # Each worker maps the source sentences in its batch to the most similar
    # target-corpus sentence, skipping sources for which no match is returned.
    def run_thread(corpus_, threshold_, batch_, update_dict):
        nonlocal progress_bar
        for source_sentence in batch_:
            most_similar_sentence = corpus_.get_most_similar_sentence(
                source_sentence, threshold_)

            if most_similar_sentence is not None:
                update_dict[source_sentence] = most_similar_sentence

            with thread_lock:
                progress_bar.update()

    pseudo_parallel_corpus = dict()
    threads = list()
    num_threads = 5
    batch_size = math.ceil(len(source_sentences) / num_threads)
    for i in range(0, len(source_sentences), batch_size):
        batch = source_sentences[i:i + batch_size]
        thread = threading.Thread(target=run_thread,
                                  args=(corpus, threshold, batch,
                                        pseudo_parallel_corpus))
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    progress_bar.close()

    file_util.write_file(list(pseudo_parallel_corpus.keys()), src_output_path)
    file_util.write_file(list(pseudo_parallel_corpus.values()),
                         tgt_output_path)
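
A minimal usage sketch; `TargetCorpus` and its constructor argument are
hypothetical stand-ins for whatever project class provides
get_most_similar_sentence, and the 0.5 threshold is illustrative only:

# Hypothetical corpus object, paths and threshold.
corpus = TargetCorpus("target_sentences.txt")
_bootstrap_parallel(corpus, "source_sentences.txt", 0.5,
                    "pseudo_parallel.src", "pseudo_parallel.tgt")
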
Example no. 4
# Relies on math, threading, tqdm and the project helpers file_util and nlp_util.
def _refine_parallel(corpus, src_parallel_path, tgt_parallel_path,
                     candidate_path, src_output_path, tgt_output_path):
    print("Refining pseudo-parallel corpus...")

    def get_refined_target(src, current_tgt, candidate_tgt):
        """ Refine a pair of pseudo-parallel sentences.
        """
        current_wmd = nlp_util.get_word_mover_dist(src, current_tgt)
        candidate_wmd = nlp_util.get_word_mover_dist(src, candidate_tgt)

        if current_wmd < candidate_wmd:
            new_target, new_target_wmd = current_tgt, current_wmd
        else:
            new_target, new_target_wmd = candidate_tgt, candidate_wmd

        most_similar_original = corpus.get_most_similar_sentence(new_target,
                                                                 threshold=0)
        original_wmd = nlp_util.get_word_mover_dist(src, most_similar_original)

        return new_target if new_target_wmd < original_wmd else most_similar_original

    src_sentences = file_util.read_file(src_parallel_path)
    tgt_sentences = file_util.read_file(tgt_parallel_path)

    candidates = file_util.read_json(candidate_path)
    pseudo_parallel_corpus = dict(zip(src_sentences, tgt_sentences))

    progress_bar = tqdm(total=len(src_sentences))
    refined_count = 0
    thread_lock = threading.Lock()

    # Each worker refines its slice of (source, target) pairs and counts how
    # many targets actually changed.
    def run_thread(src_batch_, tgt_batch_):
        nonlocal progress_bar
        nonlocal refined_count

        for source_sentence, current_target in zip(src_batch_, tgt_batch_):

            candidate = candidates[source_sentence][0]
            refined_target = get_refined_target(source_sentence,
                                                current_target, candidate)

            if pseudo_parallel_corpus[source_sentence] != refined_target:
                with thread_lock:
                    refined_count += 1

                pseudo_parallel_corpus[source_sentence] = refined_target

            with thread_lock:
                progress_bar.update()

    threads = list()
    num_threads = 5
    batch_size = math.ceil(len(src_sentences) / num_threads)
    for i in range(0, len(src_sentences), batch_size):
        src_batch = src_sentences[i:i + batch_size]
        tgt_batch = tgt_sentences[i:i + batch_size]

        thread = threading.Thread(target=run_thread,
                                  args=(src_batch, tgt_batch))
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    progress_bar.close()

    file_util.write_file(list(pseudo_parallel_corpus.keys()), src_output_path)
    file_util.write_file(list(pseudo_parallel_corpus.values()), tgt_output_path)

    update_rate = refined_count / len(pseudo_parallel_corpus)
    print(f"Number of pairs refined: {refined_count} ({update_rate:.2%})")

    return update_rate
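
A hedged usage sketch; the paths are hypothetical, `corpus` is the same kind of
project object as in Example no. 3, and the candidate JSON is assumed to map each
source sentence to a list of candidate targets, as the [0] indexing above implies:

# Hypothetical inputs; update_rate is the fraction of pairs whose target changed.
update_rate = _refine_parallel(corpus,
                               "pseudo_parallel.src", "pseudo_parallel.tgt",
                               "candidates.json",
                               "refined.src", "refined.tgt")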