def test_morph_init(self):
        """Check the tables built by ``initialize_translation_probs``.

        Runs the initialization on the two temporary files for both
        ``CharIBMModel1`` and ``Word2CharIBMModel1`` and verifies the size and
        a few entries of ``translation_prob``. Keys are looked up through
        ``str2int``.
        """
        tmp_dir, f1, f2 = morph_utils.get_two_different_tmp_files()
        # Clean up in ``finally`` so a failing assertion does not leak tmp_dir.
        try:
            char_model = CharIBMModel1()
            char_model.initialize_translation_probs(f1, f2)
            src_key = char_model.str2int("5")
            dst_key = char_model.str2int("d" + char_model.eow_symbol)
            assert char_model.translation_prob[src_key][dst_key] > 0
            assert len(char_model.translation_prob) == 80

            word_model = Word2CharIBMModel1(max_subword_len=4)
            word_model.initialize_translation_probs(f1, f2)
            word_probs = word_model.translation_prob[word_model.str2int("123456789")]
            # The 9-character string must be absent while the 3- and
            # 4-character substrings are present (presumably a consequence of
            # max_subword_len=4 -- confirm against the model implementation).
            assert word_model.str2int("abcdefghi") not in word_probs
            assert word_model.str2int("cdef") in word_probs
            assert word_model.str2int("cde") in word_probs
            assert len(word_probs) == 34
            assert len(word_model.translation_prob) == 10
        finally:
            shutil.rmtree(tmp_dir)
Esempio n. 2
0
 def test_build_bilingual_vocab(self):
     """Check that ``build_vocab`` produces a vocabulary of the requested size.

     Calls ``build_vocab`` on the two temporary files with ``vocab_size=12``,
     ``num_ibm_iters=3`` and ``num_cpus=3``, then verifies that both the
     returned value and ``len(bpe_model.vocab)`` equal 12.
     """
     bpe_model = bilingual_bpe.BilingualBPE()
     tmp_dir, f1, f2 = morph_utils.get_two_different_tmp_files()
     # Clean up in ``finally`` so a failing assertion does not leak tmp_dir.
     try:
         vocab_size = bpe_model.build_vocab(
             src_txt_path=f1,
             dst_txt_path=f2,
             vocab_size=12,
             num_ibm_iters=3,
             num_cpus=3,
         )
         assert vocab_size == len(bpe_model.vocab) == 12
     finally:
         shutil.rmtree(tmp_dir)
Esempio n. 3
0
    def test_best_candidate_bilingual(self):
        """Check that repeated ``get_best_candidate`` calls agree.

        Initializes the model with ``_init_params`` on the two temporary files
        (``num_ibm_iters=3``, ``num_cpus=3``) and verifies that two consecutive
        calls to ``get_best_candidate`` return equal values.
        """
        bpe_model = bilingual_bpe.BilingualBPE()
        tmp_dir, f1, f2 = morph_utils.get_two_different_tmp_files()
        # Clean up in ``finally`` so a failing assertion does not leak tmp_dir.
        try:
            bpe_model._init_params(
                src_txt_path=f1, dst_txt_path=f2, num_ibm_iters=3, num_cpus=3
            )

            first = bpe_model.get_best_candidate()
            second = bpe_model.get_best_candidate()
            # For the best step, it is the same as monolingual.
            # NOTE(review): both values come from the same bilingual model, so
            # this only checks that the call is repeatable -- confirm whether a
            # comparison against a monolingual model was intended.
            assert first == second
        finally:
            shutil.rmtree(tmp_dir)
Esempio n. 4
0
    def test_build_bilingual_vocab(self):
        """Check ``build_vocab`` when it is given a saved IBM model.

        Learns a ``Word2CharIBMModel1`` in the dst-to-src direction (3
        iterations), saves it under the temporary directory, and passes that
        path to ``build_vocab`` with ``vocab_size=12``. Both the returned value
        and ``len(bpe_model.vocab)`` must equal 12.
        """
        bpe_model = bilingual_bpe.BilingualBPE()
        tmp_dir, f1, f2 = morph_utils.get_two_different_tmp_files()
        # Clean up in ``finally`` so a failing assertion does not leak tmp_dir.
        try:
            dst2src_ibm_model = char_ibm_model1.Word2CharIBMModel1()
            # Paths are swapped on purpose: the model is trained dst -> src.
            dst2src_ibm_model.learn_ibm_parameters(
                src_path=f2, dst_path=f1, num_iters=3
            )
            ibm_path = path.join(tmp_dir, "ibm")
            dst2src_ibm_model.save(file_path=ibm_path)

            vocab_size = bpe_model.build_vocab(
                ibm_model_path=ibm_path,
                src_txt_path=f1,
                dst_txt_path=f2,
                vocab_size=12,
            )
            assert vocab_size == len(bpe_model.vocab) == 12
        finally:
            shutil.rmtree(tmp_dir)
Esempio n. 5
0
    def test_bilingual_bpe_init(self):
        """Check the alignment-probability table built by ``_init_params``.

        This looks more like an integration test because each sub-piece is
        tested in different places. After initialization on the two temporary
        files (``num_ibm_iters=3``, ``num_cpus=3``),
        ``bpe_probs_from_alignment`` must hold 80 entries and contain the
        end-of-word symbol.
        """
        bpe_model = bilingual_bpe.BilingualBPE()
        tmp_dir, f1, f2 = morph_utils.get_two_different_tmp_files()
        # Clean up in ``finally`` so a failing assertion does not leak tmp_dir.
        try:
            bpe_model._init_params(
                src_txt_path=f1, dst_txt_path=f2, num_ibm_iters=3, num_cpus=3
            )
            assert len(bpe_model.bpe_probs_from_alignment) == 80
            assert bpe_model.eow_symbol in bpe_model.bpe_probs_from_alignment
        finally:
            shutil.rmtree(tmp_dir)
    def test_morph_init(self):
        """Check the tables built by ``initialize_translation_probs``.

        Runs the initialization on the two temporary files for both
        ``CharIBMModel1`` and ``Word2CharIBMModel1`` and verifies the size and
        a few entries of ``translation_prob`` (keyed directly by strings), as
        well as the length of ``training_data``.
        """
        tmp_dir, f1, f2 = morph_utils.get_two_different_tmp_files()
        # Clean up in ``finally`` so a failing assertion does not leak tmp_dir.
        try:
            char_model = CharIBMModel1()
            char_model.initialize_translation_probs(f1, f2)
            assert char_model.translation_prob["5"]["d" + char_model.eow_symbol] > 0
            assert len(char_model.translation_prob) == 83
            assert len(char_model.training_data) == 4

            word_model = Word2CharIBMModel1(max_subword_len=4)
            word_model.initialize_translation_probs(f1, f2)
            word_probs = word_model.translation_prob["123456789"]
            # The 9-character string must be absent while the 3- and
            # 4-character substrings are present (presumably a consequence of
            # max_subword_len=4 -- confirm against the model implementation).
            assert "abcdefghi" not in word_probs
            assert "cdef" in word_probs
            assert "cde" in word_probs
            assert len(word_probs) == 34
            assert len(word_model.translation_prob) == 9
            assert len(word_model.training_data) == 4
        finally:
            shutil.rmtree(tmp_dir)
Esempio n. 7
0
    def test_best_candidate_bilingual(self):
        """Check that repeated ``get_best_candidate`` calls agree.

        Learns a ``Word2CharIBMModel1`` in the dst-to-src direction (3
        iterations), saves it under the temporary directory, initializes the
        BPE model from that file via ``_init_params``, and verifies that two
        consecutive calls to ``get_best_candidate`` return equal values.
        """
        bpe_model = bilingual_bpe.BilingualBPE()
        tmp_dir, f1, f2 = morph_utils.get_two_different_tmp_files()
        # Clean up in ``finally`` so a failing assertion does not leak tmp_dir.
        try:
            dst2src_ibm_model = char_ibm_model1.Word2CharIBMModel1()
            # Paths are swapped on purpose: the model is trained dst -> src.
            dst2src_ibm_model.learn_ibm_parameters(
                src_path=f2, dst_path=f1, num_iters=3
            )
            ibm_path = path.join(tmp_dir, "ibm")
            dst2src_ibm_model.save(file_path=ibm_path)

            bpe_model._init_params(
                ibm_model_path=ibm_path, src_txt_path=f1, dst_txt_path=f2
            )

            first = bpe_model.get_best_candidate()
            second = bpe_model.get_best_candidate()
            # For the best step, it is the same as monolingual.
            # NOTE(review): both values come from the same bilingual model, so
            # this only checks that the call is repeatable -- confirm whether a
            # comparison against a monolingual model was intended.
            assert first == second
        finally:
            shutil.rmtree(tmp_dir)
Esempio n. 8
0
    def test_bilingual_bpe_init(self):
        """Check the alignment-probability table built from a saved IBM model.

        This looks more like an integration test because each sub-piece is
        tested in different places. A ``Word2CharIBMModel1`` is learned in the
        dst-to-src direction (3 iterations), saved under the temporary
        directory, and handed to ``_init_params``. Afterwards
        ``bpe_probs_from_alignment`` must hold 80 entries and contain the
        end-of-word symbol.
        """
        bpe_model = bilingual_bpe.BilingualBPE()
        tmp_dir, f1, f2 = morph_utils.get_two_different_tmp_files()
        # Clean up in ``finally`` so a failing assertion does not leak tmp_dir.
        try:
            dst2src_ibm_model = char_ibm_model1.Word2CharIBMModel1()
            # Paths are swapped on purpose: the model is trained dst -> src.
            dst2src_ibm_model.learn_ibm_parameters(
                src_path=f2, dst_path=f1, num_iters=3
            )
            ibm_path = path.join(tmp_dir, "ibm")
            dst2src_ibm_model.save(file_path=ibm_path)

            bpe_model._init_params(
                ibm_model_path=ibm_path, src_txt_path=f1, dst_txt_path=f2
            )
            assert len(bpe_model.bpe_probs_from_alignment) == 80
            assert bpe_model.eow_symbol in bpe_model.bpe_probs_from_alignment
        finally:
            shutil.rmtree(tmp_dir)
Esempio n. 9
0
    def test_save_load(self):
        """Check that a saved ``BilingualBPE`` model round-trips through ``load``.

        Builds a 12-entry vocabulary from a saved dst-to-src
        ``Word2CharIBMModel1``, saves the BPE model to ``vocab.txt`` inside the
        temporary directory, loads it into a fresh instance, and verifies that
        the vocabulary and the segmentation of ``"1234"`` are unchanged.
        """
        bpe_model = bilingual_bpe.BilingualBPE()
        tmp_dir, f1, f2 = morph_utils.get_two_different_tmp_files()
        # Clean up in ``finally`` so a failing assertion does not leak tmp_dir.
        try:
            dst2src_ibm_model = char_ibm_model1.Word2CharIBMModel1()
            # Paths are swapped on purpose: the model is trained dst -> src.
            dst2src_ibm_model.learn_ibm_parameters(
                src_path=f2, dst_path=f1, num_iters=3
            )
            ibm_path = path.join(tmp_dir, "ibm")
            dst2src_ibm_model.save(file_path=ibm_path)

            vocab_size = bpe_model.build_vocab(
                ibm_model_path=ibm_path,
                src_txt_path=f1,
                dst_txt_path=f2,
                vocab_size=12,
            )
            assert vocab_size == len(bpe_model.vocab) == 12

            # Build the path the same way as ibm_path instead of concatenating
            # a hard-coded "/" separator.
            vocab_path = path.join(tmp_dir, "vocab.txt")
            bpe_model.save(file_path=vocab_path)

            loaded_model = bilingual_bpe.BilingualBPE()
            loaded_model.load(file_path=vocab_path)

            assert loaded_model.vocab == bpe_model.vocab
            assert bpe_model.segment_word("1234") == loaded_model.segment_word("1234")
        finally:
            shutil.rmtree(tmp_dir)