def test_morph_init(self):
    """Char IBM-1 init maps tokens through str2int and fills translation probs.

    Checks the integer-id variant of the API: lookups into
    ``translation_prob`` go through ``str2int``.
    """
    tmp_dir, f1, f2 = morph_utils.get_two_different_tmp_files()
    try:
        ibm_model = CharIBMModel1()
        ibm_model.initialize_translation_probs(f1, f2)
        # "5" must co-occur with "d" + end-of-word marker with nonzero prob.
        assert (
            ibm_model.translation_prob[ibm_model.str2int("5")][
                ibm_model.str2int("d" + ibm_model.eow_symbol)
            ]
            > 0
        )
        assert len(ibm_model.translation_prob) == 80

        # Word-to-char model: subwords longer than max_subword_len are excluded.
        ibm_model = Word2CharIBMModel1(max_subword_len=4)
        ibm_model.initialize_translation_probs(f1, f2)
        assert (
            ibm_model.str2int("abcdefghi")
            not in ibm_model.translation_prob[ibm_model.str2int("123456789")]
        )
        assert (
            ibm_model.str2int("cdef")
            in ibm_model.translation_prob[ibm_model.str2int("123456789")]
        )
        assert (
            ibm_model.str2int("cde")
            in ibm_model.translation_prob[ibm_model.str2int("123456789")]
        )
        assert len(ibm_model.translation_prob[ibm_model.str2int("123456789")]) == 34
        assert len(ibm_model.translation_prob) == 10
    finally:
        # Clean up even when an assertion above fails, so the temp dir
        # never leaks into later tests.
        shutil.rmtree(tmp_dir)
def test_build_bilingual_vocab(self):
    """build_vocab (training IBM params internally) returns the vocab size."""
    bpe_model = bilingual_bpe.BilingualBPE()
    tmp_dir, f1, f2 = morph_utils.get_two_different_tmp_files()
    try:
        vocab_size = bpe_model.build_vocab(
            src_txt_path=f1,
            dst_txt_path=f2,
            vocab_size=12,
            num_ibm_iters=3,
            num_cpus=3,
        )
        # Returned size, stored vocab, and the requested size must agree.
        assert vocab_size == len(bpe_model.vocab) == 12
    finally:
        # Always remove the temp dir, even if an assertion fails.
        shutil.rmtree(tmp_dir)
def test_best_candidate_bilingual(self):
    """First merge candidate is stable across repeated calls after init."""
    bpe_model = bilingual_bpe.BilingualBPE()
    tmp_dir, f1, f2 = morph_utils.get_two_different_tmp_files()
    try:
        bpe_model._init_params(
            src_txt_path=f1, dst_txt_path=f2, num_ibm_iters=3, num_cpus=3
        )
        b1 = bpe_model.get_best_candidate()
        c1 = bpe_model.get_best_candidate()
        # For the best step, it is the same as monolingual.
        assert b1 == c1
    finally:
        # Always remove the temp dir, even if an assertion fails.
        shutil.rmtree(tmp_dir)
def test_build_bilingual_vocab_from_saved_ibm(self):
    """build_vocab with a pre-trained, saved IBM model returns the vocab size.

    Renamed from ``test_build_bilingual_vocab`` to avoid shadowing the
    earlier test of the same name (duplicate defs in a class mean only
    the last one is collected/run).
    """
    bpe_model = bilingual_bpe.BilingualBPE()
    tmp_dir, f1, f2 = morph_utils.get_two_different_tmp_files()
    try:
        # Train dst->src IBM parameters separately and persist them.
        dst2src_ibm_model = char_ibm_model1.Word2CharIBMModel1()
        dst2src_ibm_model.learn_ibm_parameters(src_path=f2, dst_path=f1, num_iters=3)
        ibm_path = path.join(tmp_dir, "ibm")
        dst2src_ibm_model.save(file_path=ibm_path)

        vocab_size = bpe_model.build_vocab(
            ibm_model_path=ibm_path, src_txt_path=f1, dst_txt_path=f2, vocab_size=12
        )
        assert vocab_size == len(bpe_model.vocab) == 12
    finally:
        # Always remove the temp dir, even if an assertion fails.
        shutil.rmtree(tmp_dir)
def test_bilingual_bpe_init(self):
    """_init_params populates alignment-based BPE probabilities.

    This looks more like an integration test because each subpiece is
    tested in different places.
    """
    bpe_model = bilingual_bpe.BilingualBPE()
    tmp_dir, f1, f2 = morph_utils.get_two_different_tmp_files()
    try:
        bpe_model._init_params(
            src_txt_path=f1, dst_txt_path=f2, num_ibm_iters=3, num_cpus=3
        )
        assert len(bpe_model.bpe_probs_from_alignment) == 80
        # The end-of-word symbol must always be a candidate.
        assert bpe_model.eow_symbol in bpe_model.bpe_probs_from_alignment
    finally:
        # Always remove the temp dir, even if an assertion fails.
        shutil.rmtree(tmp_dir)
def test_morph_init_str_keys(self):
    """Char IBM-1 init with plain string keys in translation_prob.

    Renamed from ``test_morph_init`` to avoid shadowing the earlier test
    of the same name (duplicate defs in a class mean only the last one
    is collected/run). This variant indexes ``translation_prob`` with raw
    strings and also checks ``training_data``.
    """
    tmp_dir, f1, f2 = morph_utils.get_two_different_tmp_files()
    try:
        ibm_model = CharIBMModel1()
        ibm_model.initialize_translation_probs(f1, f2)
        # "5" must co-occur with "d" + end-of-word marker with nonzero prob.
        assert ibm_model.translation_prob["5"]["d" + ibm_model.eow_symbol] > 0
        assert len(ibm_model.translation_prob) == 83
        assert len(ibm_model.training_data) == 4

        # Word-to-char model: subwords longer than max_subword_len are excluded.
        ibm_model = Word2CharIBMModel1(max_subword_len=4)
        ibm_model.initialize_translation_probs(f1, f2)
        assert "abcdefghi" not in ibm_model.translation_prob["123456789"]
        assert "cdef" in ibm_model.translation_prob["123456789"]
        assert "cde" in ibm_model.translation_prob["123456789"]
        assert len(ibm_model.translation_prob["123456789"]) == 34
        assert len(ibm_model.translation_prob) == 9
        assert len(ibm_model.training_data) == 4
    finally:
        # Clean up even when an assertion above fails.
        shutil.rmtree(tmp_dir)
def test_best_candidate_bilingual_from_saved_ibm(self):
    """Best candidate is stable when init loads a saved IBM model.

    Renamed from ``test_best_candidate_bilingual`` to avoid shadowing the
    earlier test of the same name (duplicate defs in a class mean only
    the last one is collected/run).
    """
    bpe_model = bilingual_bpe.BilingualBPE()
    tmp_dir, f1, f2 = morph_utils.get_two_different_tmp_files()
    try:
        # Train dst->src IBM parameters separately and persist them.
        dst2src_ibm_model = char_ibm_model1.Word2CharIBMModel1()
        dst2src_ibm_model.learn_ibm_parameters(src_path=f2, dst_path=f1, num_iters=3)
        ibm_path = path.join(tmp_dir, "ibm")
        dst2src_ibm_model.save(file_path=ibm_path)

        bpe_model._init_params(
            ibm_model_path=ibm_path, src_txt_path=f1, dst_txt_path=f2
        )
        b1 = bpe_model.get_best_candidate()
        c1 = bpe_model.get_best_candidate()
        # For the best step, it is the same as monolingual.
        assert b1 == c1
    finally:
        # Always remove the temp dir, even if an assertion fails.
        shutil.rmtree(tmp_dir)
def test_bilingual_bpe_init_from_saved_ibm(self):
    """_init_params with a saved IBM model populates alignment probs.

    Renamed from ``test_bilingual_bpe_init`` to avoid shadowing the
    earlier test of the same name (duplicate defs in a class mean only
    the last one is collected/run). This looks more like an integration
    test because each subpiece is tested in different places.
    """
    bpe_model = bilingual_bpe.BilingualBPE()
    tmp_dir, f1, f2 = morph_utils.get_two_different_tmp_files()
    try:
        # Train dst->src IBM parameters separately and persist them.
        dst2src_ibm_model = char_ibm_model1.Word2CharIBMModel1()
        dst2src_ibm_model.learn_ibm_parameters(src_path=f2, dst_path=f1, num_iters=3)
        ibm_path = path.join(tmp_dir, "ibm")
        dst2src_ibm_model.save(file_path=ibm_path)

        bpe_model._init_params(
            ibm_model_path=ibm_path, src_txt_path=f1, dst_txt_path=f2
        )
        assert len(bpe_model.bpe_probs_from_alignment) == 80
        # The end-of-word symbol must always be a candidate.
        assert bpe_model.eow_symbol in bpe_model.bpe_probs_from_alignment
    finally:
        # Always remove the temp dir, even if an assertion fails.
        shutil.rmtree(tmp_dir)
def test_save_load(self):
    """A saved-then-loaded BPE model reproduces vocab and segmentation."""
    bpe_model = bilingual_bpe.BilingualBPE()
    tmp_dir, f1, f2 = morph_utils.get_two_different_tmp_files()
    try:
        # Train dst->src IBM parameters separately and persist them.
        dst2src_ibm_model = char_ibm_model1.Word2CharIBMModel1()
        dst2src_ibm_model.learn_ibm_parameters(src_path=f2, dst_path=f1, num_iters=3)
        ibm_path = path.join(tmp_dir, "ibm")
        dst2src_ibm_model.save(file_path=ibm_path)

        vocab_size = bpe_model.build_vocab(
            ibm_model_path=ibm_path, src_txt_path=f1, dst_txt_path=f2, vocab_size=12
        )
        assert vocab_size == len(bpe_model.vocab) == 12

        # Round-trip: save, load into a fresh model, compare.
        # Use path.join (not string concat) for portability, matching the
        # rest of the file.
        vocab_path = path.join(tmp_dir, "vocab.txt")
        bpe_model.save(file_path=vocab_path)
        loaded_model = bilingual_bpe.BilingualBPE()
        loaded_model.load(file_path=vocab_path)
        assert loaded_model.vocab == bpe_model.vocab
        assert bpe_model.segment_word("1234") == loaded_model.segment_word("1234")
    finally:
        # Always remove the temp dir, even if an assertion fails.
        shutil.rmtree(tmp_dir)