def test_morph_init(self):
    """Check the table built by ``initialize_translation_probs``.

    Asserts the number of top-level entries, the number of entries stored
    under ``null_str`` and under ``"345"``, and one initial probability.
    """
    ibm_model = IBMModel1()
    tmp_dir, f1, f2 = get_two_tmp_files()
    # Remove the temp directory in ``finally`` so that a failing assertion
    # does not leave it behind on disk.
    try:
        ibm_model.initialize_translation_probs(f1, f2)
        assert len(ibm_model.translation_prob) == 10
        assert len(ibm_model.translation_prob[ibm_model.null_str]) == 9
        assert len(ibm_model.translation_prob["345"]) == 6
        assert ibm_model.translation_prob["122"]["123"] == 1.0 / 4
    finally:
        shutil.rmtree(tmp_dir)
def test_str2int(self):
    """Check that ``str2int`` hands out stable ids and ``int2str`` inverts them."""
    model = IBMModel1()

    # Two identical passes: the second pass must return the very same ids
    # that the first pass produced.
    for _ in range(2):
        assert model.str2int("hello") == 1
        assert model.str2int("bye") == 2

    # Both lookup structures hold three entries, with null_str in slot 0.
    assert len(model._str2int) == 3
    assert len(model._int2str) == 3
    expected_vocab = [model.null_str, "hello", "bye"]
    assert model._int2str == expected_vocab
    assert model.int2str(2) == "bye"
def test_ibm_train(self):
    """Run ``learn_ibm_parameters`` with ``num_iters=3`` and check the result.

    Asserts two specific translation probabilities and that the
    ``null_str`` entry for ``"124"`` is smaller than the one for
    ``"456789"``.
    """
    ibm_model = IBMModel1()
    tmp_dir, f1, f2 = morph_utils.get_two_tmp_files()
    # Remove the temp directory in ``finally`` so that a failing assertion
    # does not leave it behind on disk.
    try:
        ibm_model.learn_ibm_parameters(src_path=f1, dst_path=f2, num_iters=3)
        assert ibm_model.translation_prob["456789"]["345"] == 0
        assert ibm_model.translation_prob["456789"]["456789"] == 0.5
        assert (
            ibm_model.translation_prob[ibm_model.null_str]["124"]
            < ibm_model.translation_prob[ibm_model.null_str]["456789"]
        )
    finally:
        shutil.rmtree(tmp_dir)
def test_e_step(self):
    """Run ``e_step`` on one sentence pair and check an accumulated count.

    The counts are accumulated into a fresh nested ``defaultdict`` that is
    passed to ``e_step`` as its third argument.
    """
    ibm_model = IBMModel1()
    tmp_dir, f1, f2 = morph_utils.get_two_tmp_files()
    # Remove the temp directory in ``finally`` so that a failing assertion
    # does not leave it behind on disk.
    try:
        ibm_model.initialize_translation_probs(f1, f2)
        translation_counts = defaultdict(lambda: defaultdict(float))
        ibm_model.e_step(
            ["123", "124", "234", "345", ibm_model.null_str],
            ["123", "124", "234", "345"],
            translation_counts,
        )
        assert translation_counts["123"]["345"] == 1.0 / 4
    finally:
        shutil.rmtree(tmp_dir)
def test_em_step(self):
    """Run one ``em_step`` with a 3-worker pool and check the probabilities.

    Asserts two specific translation probabilities and that the
    ``null_str`` entry for ``"124"`` is smaller than the one for
    ``"456789"``.
    """
    ibm_model = IBMModel1()
    tmp_dir, f1, f2 = morph_utils.get_two_same_tmp_files()
    # Remove the temp directory in ``finally`` so that a failing assertion
    # does not leave it behind on disk.
    try:
        ibm_model.initialize_translation_probs(f1, f2)
        # The original never released the pool. Using it as a context
        # manager shuts the workers down once em_step has returned.
        # NOTE(review): assumes Pool is multiprocessing.Pool (which supports
        # the context-manager protocol) -- confirm against the imports.
        with Pool(3) as pool:
            ibm_model.em_step(src_path=f1, dst_path=f2, num_cpus=3, pool=pool)
        assert ibm_model.translation_prob["456789"]["345"] == 0
        assert ibm_model.translation_prob["456789"]["456789"] == 0.5
        assert (
            ibm_model.translation_prob[ibm_model.null_str]["124"]
            < ibm_model.translation_prob[ibm_model.null_str]["456789"]
        )
    finally:
        shutil.rmtree(tmp_dir)
def test_em_step(self):
    """Run one ``em_step`` after initialization and check the probabilities.

    Asserts two specific translation probabilities and that the
    ``null_str`` entry for ``"124"`` is smaller than the one for
    ``"456789"``.
    """
    ibm_model = IBMModel1()
    tmp_dir, f1, f2 = morph_utils.get_two_tmp_files()
    # Remove the temp directory in ``finally`` so that a failing assertion
    # does not leave it behind on disk.
    try:
        ibm_model.initialize_translation_probs(f1, f2)
        ibm_model.em_step(f1, f2)
        assert ibm_model.translation_prob["456789"]["345"] == 0
        assert ibm_model.translation_prob["456789"]["456789"] == 0.5
        assert (
            ibm_model.translation_prob[ibm_model.null_str]["124"]
            < ibm_model.translation_prob[ibm_model.null_str]["456789"]
        )
    finally:
        shutil.rmtree(tmp_dir)
def test_morph_init(self):
    """Check the table built by ``initialize_translation_probs``.

    The table is indexed through ``str2int``. Asserts the number of
    top-level entries, the number of entries stored under ``null_str`` and
    under ``"345"``, and one initial probability.
    """
    ibm_model = IBMModel1()
    tmp_dir, f1, f2 = morph_utils.get_two_same_tmp_files()
    # Remove the temp directory in ``finally`` so that a failing assertion
    # does not leave it behind on disk.
    try:
        ibm_model.initialize_translation_probs(f1, f2)
        assert len(ibm_model.translation_prob) == 10
        assert (
            len(ibm_model.translation_prob[ibm_model.str2int(ibm_model.null_str)])
            == 9
        )
        assert len(ibm_model.translation_prob[ibm_model.str2int("345")]) == 6
        assert (
            ibm_model.translation_prob[ibm_model.str2int("122")][
                ibm_model.str2int("123")
            ]
            == 1.0 / 4
        )
    finally:
        shutil.rmtree(tmp_dir)
def test_ibm_train(self):
    """Run ``learn_ibm_parameters`` with ``num_iters=3`` and check the result.

    The table is indexed through ``str2int``. Asserts two specific
    translation probabilities for the source entry ``"456789"``.
    """
    ibm_model = IBMModel1()
    tmp_dir, f1, f2 = morph_utils.get_two_same_tmp_files()
    # Remove the temp directory in ``finally`` so that a failing assertion
    # does not leave it behind on disk.
    try:
        ibm_model.learn_ibm_parameters(src_path=f1, dst_path=f2, num_iters=3)
        assert (
            ibm_model.translation_prob[ibm_model.str2int("456789")][
                ibm_model.str2int("345")
            ]
            == 0
        )
        assert (
            ibm_model.translation_prob[ibm_model.str2int("456789")][
                ibm_model.str2int("456789")
            ]
            == 0.5
        )
    finally:
        shutil.rmtree(tmp_dir)
def test_expectation_for_one_sentence(self):
    """Run ``expectation_for_one_sentence`` once and check one count.

    Both sentences are passed as ``Counter`` objects over ``str2int`` ids,
    and the counts are accumulated into a fresh nested ``defaultdict``.
    The checked value is compared after rounding to three decimals.
    """
    ibm_model = IBMModel1()
    tmp_dir, f1, f2 = morph_utils.get_two_same_tmp_files()
    # Remove the temp directory in ``finally`` so that a failing assertion
    # does not leave it behind on disk.
    try:
        ibm_model.initialize_translation_probs(f1, f2)
        translation_counts = defaultdict(lambda: defaultdict(float))
        # Built in the same order as the original call's arguments, so the
        # sequence of str2int calls is unchanged.
        first_sentence = Counter(
            ibm_model.str2int(w)
            for w in ["123", "124", "234", "345", ibm_model.null_str]
        )
        second_sentence = Counter(
            ibm_model.str2int(w) for w in ["123", "124", "234", "345"]
        )
        ibm_model.expectation_for_one_sentence(
            first_sentence,
            second_sentence,
            translation_counts,
        )
        assert (
            round(
                translation_counts[ibm_model.str2int("123")][
                    ibm_model.str2int("345")
                ],
                3,
            )
            == 0.176
        )
    finally:
        shutil.rmtree(tmp_dir)