def test_segment_viterbi_w_smoothing(self):
    # Viterbi segmentation with a params object built from its default
    # constructor arguments (no overrides are passed here).
    corpus_lines = [
        "123 124 234 345",
        "112 122 123 345",
        "123456789",
        "123456 456789",
    ]
    hmm_params = unsupervised_morphology.MorphologyHMMParams()

    with patch("builtins.open") as fake_open:
        fake_file = fake_open.return_value
        # Entering the context calls the patched open again, which hands
        # back this same fake file object.
        fake_file.__enter__ = fake_open
        # NOTE: the iterator is created once, so the fake file yields the
        # corpus lines on a single pass only.
        fake_file.__iter__ = Mock(return_value=iter(corpus_lines))
        # The path is never touched on disk while open() is patched.
        hmm_params.init_params_from_data("no_exist_file.txt")

        segmentor = unsupervised_morphology.MorphologySegmentor(hmm_params)
        observed = segmentor.segment_viterbi("123123789")
        assert observed == [0, 2, 3, 5, 6, 9]
def test_segment_viterbi_no_smoothing(self):
    # Viterbi segmentation with smoothing_const=0.0 and morph-likeness
    # switched off, after uniform initialisation from the fake corpus.
    corpus_lines = [
        "123 124 234 345",
        "112 122 123 345",
        "123456789",
        "123456 456789",
    ]
    hmm_params = unsupervised_morphology.MorphologyHMMParams(
        smoothing_const=0.0,
        use_morph_likeness=False,
    )

    with patch("builtins.open") as fake_open:
        fake_file = fake_open.return_value
        # Entering the context calls the patched open again, which hands
        # back this same fake file object.
        fake_file.__enter__ = fake_open
        # NOTE: the iterator is created once, so the fake file yields the
        # corpus lines on a single pass only.
        fake_file.__iter__ = Mock(return_value=iter(corpus_lines))
        # The path is never touched on disk while open() is patched.
        hmm_params.init_uniform_params_from_data("no_exist_file.txt")

        segmentor = unsupervised_morphology.MorphologySegmentor(hmm_params)
        # Expected result is a pair: a list of labels and a list of
        # integer offsets (presumably segment boundaries -- confirm
        # against segment_viterbi).
        expected = (
            ["prefix", "prefix", "stem"],
            [0, 3, 6, 9],
        )
        observed = segmentor.segment_viterbi("123123789")
        assert observed == expected
def test_segment_word_no_smoothing(self):
    # segment_word output, with and without affix symbols, for a params
    # object using smoothing_const=0.0 and morph-likeness switched off.
    corpus_lines = [
        "123 124 234 345",
        "112 122 123 345",
        "123456789",
        "123456 456789",
    ]
    hmm_params = unsupervised_morphology.MorphologyHMMParams(
        smoothing_const=0.0,
        use_morph_likeness=False,
    )

    with patch("builtins.open") as fake_open:
        fake_file = fake_open.return_value
        # Entering the context calls the patched open again, which hands
        # back this same fake file object.
        fake_file.__enter__ = fake_open
        # NOTE: the iterator is created once, so the fake file yields the
        # corpus lines on a single pass only.
        fake_file.__iter__ = Mock(return_value=iter(corpus_lines))
        # The path is never touched on disk while open() is patched.
        hmm_params.init_uniform_params_from_data("no_exist_file.txt")

        segmentor = unsupervised_morphology.MorphologySegmentor(hmm_params)

        # Plain output: segments joined by single spaces.
        plain = segmentor.segment_word("123123789789")
        assert plain == "123 123 789 789"

        # With affix symbols, "+" marks are attached to the non-stem pieces.
        marked = segmentor.segment_word(
            "123123789789", add_affix_symbols=True
        )
        assert marked == "123+ 123+ 789 +789"

        # For the input "123" both modes must produce the same string.
        short_plain = segmentor.segment_word("123")
        short_marked = segmentor.segment_word("123", add_affix_symbols=True)
        assert short_plain == short_marked
length_slope=options.length_slope, ) print("Number of training words", len(model.params.word_counts)) model.expectation_maximization( options.em_iter, options.num_cpus, options.model_path if options.save_checkpoint else None, ) if not options.save_checkpoint: model.params.save(options.model_path) if (options.input_file is not None and options.output_file is not None and options.model_path is not None): model = unsupervised_morphology.MorphologyHMMParams.load( options.model_path) segmentor = unsupervised_morphology.MorphologySegmentor(model) segment_cache = {} writer = open(options.output_file, "w", encoding="utf-8") with open(options.input_file, "r", encoding="utf-8") as input_stream: for line in input_stream: output = [] for word in line.strip().split(): if word not in segment_cache: segmented = segmentor.segment_word( word, add_affix_symbols=options.add_affix_symbols) segment_cache[word] = segmented output.append(segment_cache[word]) writer.write(" ".join(output) + "\n") writer.close() if options.investigate and options.model_path is not None: