def test_likeness_probs(self):
    """Check morph-likeness values after uniform initialization.

    ``open`` is patched so the model reads a fixed four-line corpus, then
    ``morph_likeness`` for the morphs "23" and "789" is compared, per tag,
    against recorded reference values at three decimal places.
    """
    corpus = [
        "123 124 234 345",
        "112 122 123 345",
        "123456789",
        "123456 456789",
    ]
    params = unsupervised_morphology.MorphologyHMMParams(
        use_morph_likeness=True)

    with patch("builtins.open") as fake_open:
        # open(...) and the context manager's __enter__ both hand back the
        # same mock handle; iterating that handle yields the corpus lines.
        handle = fake_open.return_value
        handle.__enter__ = fake_open
        handle.__iter__ = Mock(return_value=iter(corpus))
        params.init_uniform_params_from_data("no_exist_file.txt")

    float_precision = 3
    # NOTE(review): several reference values are below 0.005, so at three
    # decimals they constrain the result only loosely.
    reference = (
        ("prefix", "23", 0.016515200181119086),
        ("stem", "23", 0.9824677845172917),
        ("suffix", "23", 0.0010170153015892497),
        ("prefix", "789", 0.00024574366719647703),
        ("stem", "789", 0.9957636518152019),
        ("suffix", "789", 0.003990604517601711),
    )
    for tag, morph, wanted in reference:
        actual = params.morph_likeness[tag][morph]
        assert round(actual, float_precision) == round(
            wanted, float_precision)
def test_zero_out_params(self):
    """Check that zeroing the parameters clears every probability.

    After uniform initialization from the mocked corpus every emission
    probability must be positive; after ``zero_out_parmas()`` every emission
    and every transition probability must equal 0.
    """
    corpus = [
        "123 124 234 345",
        "112 122 123 345",
        "123456789",
        "123456 456789",
    ]
    params = unsupervised_morphology.MorphologyHMMParams()

    with patch("builtins.open") as fake_open:
        # open(...) and __enter__ both return the same mock handle, whose
        # iteration yields the corpus lines.
        handle = fake_open.return_value
        handle.__enter__ = fake_open
        handle.__iter__ = Mock(return_value=iter(corpus))
        params.init_uniform_params_from_data("no_exist_file.txt")

    # Precondition: initialization produced strictly positive emissions.
    for emissions in params.morph_emit_probs.values():
        for prob in emissions.values():
            assert prob > 0

    # The method name is spelled this way in the API under test.
    params.zero_out_parmas()

    for emissions in params.morph_emit_probs.values():
        for prob in emissions.values():
            assert prob == 0

    for transitions in params.affix_trans_probs.values():
        for prob in transitions.values():
            assert prob == 0
def test_morph_init(self):
    """Check emission and transition tables after uniform initialization.

    Verifies the number of candidate morphs per tag, that selected morphs are
    absent from specific tags, one stem emission probability, and the full
    5x5 table of tag-to-tag transition probabilities.
    """
    # NOTE(review): another ``test_morph_init`` is defined later in this
    # file; if both live in the same class, the later one replaces this one.
    corpus = [
        "123 124 234 345",
        "112 122 123 345",
        "123456789",
        "123456 456789",
    ]
    params = unsupervised_morphology.MorphologyHMMParams()

    with patch("builtins.open") as fake_open:
        # open(...) and __enter__ both return the same mock handle, whose
        # iteration yields the corpus lines.
        handle = fake_open.return_value
        handle.__enter__ = fake_open
        handle.__iter__ = Mock(return_value=iter(corpus))
        params.init_uniform_params_from_data("no_exist_file.txt")

    # (tag, expected number of morphs, a morph that must NOT be listed)
    emission_checks = (
        ("prefix", 28, "9"),
        ("stem", 42, "689"),
        ("suffix", 29, "1"),
    )
    for tag, size, absent in emission_checks:
        assert len(params.morph_emit_probs[tag]) == size
        assert absent not in params.morph_emit_probs[tag]

    assert params.morph_emit_probs["stem"]["1234"] == 1.1 / (42 * 1.1)

    # Rows are the previous tag, columns follow ``tags`` order.
    tags = ("START", "prefix", "stem", "suffix", "END")
    third = 1.0 / 3
    expected_rows = {
        "START": (0, 0.5, 0.5, 0, 0),
        "prefix": (0, 0.5, 0.5, 0, 0),
        "stem": (0, 0, third, third, third),
        "suffix": (0, 0, 0, 0.5, 0.5),
        "END": (0, 0, 0, 0, 0),
    }
    for prev_tag, row in expected_rows.items():
        for tag, wanted in zip(tags, row):
            assert params.affix_trans_probs[prev_tag][tag] == wanted
def test_segment_viterbi_w_smoothing(self):
    """Check Viterbi segmentation of "123123789" with default parameters.

    The model is initialized from the mocked corpus with the default
    constructor arguments and the returned value is compared with a recorded
    list of indices.
    """
    corpus = [
        "123 124 234 345",
        "112 122 123 345",
        "123456789",
        "123456 456789",
    ]
    params = unsupervised_morphology.MorphologyHMMParams()

    with patch("builtins.open") as fake_open:
        # open(...) and __enter__ both return the same mock handle, whose
        # iteration yields the corpus lines.
        handle = fake_open.return_value
        handle.__enter__ = fake_open
        handle.__iter__ = Mock(return_value=iter(corpus))
        params.init_params_from_data("no_exist_file.txt")

    segmentor = unsupervised_morphology.MorphologySegmentor(params)
    expected = [0, 2, 3, 5, 6, 9]
    assert segmentor.segment_viterbi("123123789") == expected
def test_emission_probs(self):
    """Check ``emission_prob(tag, morph)`` for three tag/morph pairs.

    Expected values are given as exact expressions and compared with ``==``.
    """
    # NOTE(review): another ``test_emission_probs`` is defined later in this
    # file; if both live in the same class, the later one replaces this one.
    corpus = [
        "123 124 234 345",
        "112 122 123 345",
        "123456789",
        "123456 456789",
    ]
    params = unsupervised_morphology.MorphologyHMMParams()

    with patch("builtins.open") as fake_open:
        # open(...) and __enter__ both return the same mock handle, whose
        # iteration yields the corpus lines.
        handle = fake_open.return_value
        handle.__enter__ = fake_open
        handle.__iter__ = Mock(return_value=iter(corpus))
        params.init_params_from_data("no_exist_file.txt")

    cases = (
        ("stem", "1234", 1.1 / (42 * 1.1)),
        ("suffix", "1", 0.1 / (29 * 1.1)),
        ("END", "1", 0),
    )
    for tag, morph, wanted in cases:
        assert params.emission_prob(tag, morph) == wanted
def test_emission_probs(self):
    """Check ``emission_prob("1234")`` against a recorded value.

    The reference is ``0.014141414141414142 * exp(-9)``; both sides are
    rounded to three decimals before comparison.
    """
    # NOTE(review): an earlier ``test_emission_probs`` is defined in this
    # file; if both live in the same class, this one replaces it.
    corpus = [
        "123 124 234 345",
        "112 122 123 345",
        "123456789",
        "123456 456789",
    ]
    params = unsupervised_morphology.MorphologyHMMParams()

    with patch("builtins.open") as fake_open:
        # open(...) and __enter__ both return the same mock handle, whose
        # iteration yields the corpus lines.
        handle = fake_open.return_value
        handle.__enter__ = fake_open
        handle.__iter__ = Mock(return_value=iter(corpus))
        params.init_params_from_data("no_exist_file.txt")

    # TODO: add more tests
    base_prob = 0.014141414141414142
    wanted = base_prob * math.exp(-9)
    # NOTE(review): ``wanted`` is about 1.7e-6, which rounds to 0.0 at three
    # decimals, so this passes for any result that rounds to 0.0 at three
    # decimals.
    # Consider a relative comparison once the expected value is confirmed.
    actual = params.emission_prob("1234")
    assert round(actual, 3) == round(wanted, 3)
def test_morph_init(self):
    """Check the flat emission table after ``init_params_from_data``.

    Verifies the number of entries in ``morph_emit_probs`` and the value
    stored for "1234" (compared at three decimals).
    """
    # NOTE(review): an earlier ``test_morph_init`` is defined in this file;
    # if both live in the same class, this one replaces it.
    corpus = [
        "123 124 234 345",
        "112 122 123 345",
        "123456789",
        "123456 456789",
    ]
    params = unsupervised_morphology.MorphologyHMMParams()

    with patch("builtins.open") as fake_open:
        # open(...) and __enter__ both return the same mock handle, whose
        # iteration yields the corpus lines.
        handle = fake_open.return_value
        handle.__enter__ = fake_open
        handle.__iter__ = Mock(return_value=iter(corpus))
        params.init_params_from_data("no_exist_file.txt")

    assert len(params.morph_emit_probs) == 51

    wanted = 0.014141414141414142
    actual = params.morph_emit_probs["1234"]
    assert round(actual, 3) == round(wanted, 3)
def test_transition_log_probs(self):
    """Check ``transition_log_prob`` for two tag pairs.

    "stem" -> "END" must equal ``log(1/3)``; "suffix" -> "START" must equal
    the model's ``SMALL_CONST``.
    """
    corpus = [
        "123 124 234 345",
        "112 122 123 345",
        "123456789",
        "123456 456789",
    ]
    params = unsupervised_morphology.MorphologyHMMParams()

    with patch("builtins.open") as fake_open:
        # open(...) and __enter__ both return the same mock handle, whose
        # iteration yields the corpus lines.
        handle = fake_open.return_value
        handle.__enter__ = fake_open
        handle.__iter__ = Mock(return_value=iter(corpus))
        params.init_uniform_params_from_data("no_exist_file.txt")

    stem_to_end = params.transition_log_prob("stem", "END")
    assert stem_to_end == math.log(1.0 / 3)

    suffix_to_start = params.transition_log_prob("suffix", "START")
    assert suffix_to_start == params.SMALL_CONST
def test_segment_viterbi_no_smoothing(self):
    """Check Viterbi segmentation of "123123789" without smoothing.

    The model is built with ``smoothing_const=0.0`` and
    ``use_morph_likeness=False``; the result is compared with a recorded
    ``(tags, indices)`` pair.
    """
    corpus = [
        "123 124 234 345",
        "112 122 123 345",
        "123456789",
        "123456 456789",
    ]
    params = unsupervised_morphology.MorphologyHMMParams(
        smoothing_const=0.0, use_morph_likeness=False)

    with patch("builtins.open") as fake_open:
        # open(...) and __enter__ both return the same mock handle, whose
        # iteration yields the corpus lines.
        handle = fake_open.return_value
        handle.__enter__ = fake_open
        handle.__iter__ = Mock(return_value=iter(corpus))
        params.init_uniform_params_from_data("no_exist_file.txt")

    segmentor = unsupervised_morphology.MorphologySegmentor(params)
    expected_tags = ["prefix", "prefix", "stem"]
    expected_indices = [0, 3, 6, 9]
    assert segmentor.segment_viterbi("123123789") == (
        expected_tags,
        expected_indices,
    )
def test_emission_log_probs(self):
    """Check ``emission_log_probs(tag, morph)`` for three tag/morph pairs.

    Two results are compared with the log of an exact expression; the
    ("END", "1") result must equal the model's ``SMALL_CONST``.
    """
    corpus = [
        "123 124 234 345",
        "112 122 123 345",
        "123456789",
        "123456 456789",
    ]
    params = unsupervised_morphology.MorphologyHMMParams(
        use_morph_likeness=False)

    with patch("builtins.open") as fake_open:
        # open(...) and __enter__ both return the same mock handle, whose
        # iteration yields the corpus lines.
        handle = fake_open.return_value
        handle.__enter__ = fake_open
        handle.__iter__ = Mock(return_value=iter(corpus))
        params.init_uniform_params_from_data("no_exist_file.txt")

    stem_logp = params.emission_log_probs("stem", "1234")
    assert stem_logp == math.log(1.1 / (42 * 1.1))

    suffix_logp = params.emission_log_probs("suffix", "1")
    assert suffix_logp == math.log(0.1 / (29 * 1.1))

    end_logp = params.emission_log_probs("END", "1")
    assert end_logp == params.SMALL_CONST
def test_morph_normal_init(self):
    """
    Check that normal initialization does not break.

    Builds 1000 random words of the form [prefix] + stem + [suffix] from
    small fixed inventories, feeds them through a patched ``open``, and
    checks that a fixed set of transition probabilities is 0.
    """
    stems = ["jump", "say", "work", "play"]
    prefixes = ["re"]
    suffixes = ["ing", "s", "ed"]

    txt_content = []
    for _ in range(1000):
        prefix = ""
        suffix = ""
        # The random calls are made in a fixed order: prefix decision and
        # pick, suffix decision and pick, then the stem pick.
        if random.randint(1, 5) > 2:
            prefix = prefixes[random.randint(0, len(prefixes) - 1)]
        if random.randint(1, 5) > 2:
            suffix = suffixes[random.randint(0, len(suffixes) - 1)]
        stem = stems[random.randint(0, len(stems) - 1)]
        txt_content.append(prefix + stem + suffix)

    params = unsupervised_morphology.MorphologyHMMParams(
        use_morph_likeness=False)

    with patch("builtins.open") as fake_open:
        # open(...) and __enter__ both return the same mock handle, whose
        # iteration yields the generated words.
        handle = fake_open.return_value
        handle.__enter__ = fake_open
        handle.__iter__ = Mock(return_value=iter(txt_content))
        params.init_params_with_normal_distribution("no_exist_file.txt")

    # (previous tag, tag) pairs whose transition probability must be 0.
    zero_transitions = (
        ("END", "START"),
        ("END", "prefix"),
        ("END", "stem"),
        ("END", "suffix"),
        ("END", "END"),
        ("START", "START"),
        ("START", "suffix"),
        ("START", "END"),
    )
    for prev_tag, tag in zero_transitions:
        assert params.affix_trans_probs[prev_tag][tag] == 0
def test_segment_word_no_smoothing(self):
    """Check ``segment_word`` output, with and without affix symbols.

    The model is built with ``smoothing_const=0.0`` and
    ``use_morph_likeness=False``. For "123" the two output forms must be
    equal to each other.
    """
    corpus = [
        "123 124 234 345",
        "112 122 123 345",
        "123456789",
        "123456 456789",
    ]
    params = unsupervised_morphology.MorphologyHMMParams(
        smoothing_const=0.0, use_morph_likeness=False)

    with patch("builtins.open") as fake_open:
        # open(...) and __enter__ both return the same mock handle, whose
        # iteration yields the corpus lines.
        handle = fake_open.return_value
        handle.__enter__ = fake_open
        handle.__iter__ = Mock(return_value=iter(corpus))
        params.init_uniform_params_from_data("no_exist_file.txt")

    segmentor = unsupervised_morphology.MorphologySegmentor(params)

    plain = segmentor.segment_word("123123789789")
    assert plain == "123 123 789 789"

    marked = segmentor.segment_word("123123789789", add_affix_symbols=True)
    assert marked == "123+ 123+ 789 +789"

    assert segmentor.segment_word("123") == segmentor.segment_word(
        "123", add_affix_symbols=True)