コード例 #1
0
    def test_segment_viterbi_w_smoothing(self):
        """Check ``segment_viterbi`` with default-constructed HMM parameters.

        ``builtins.open`` is patched so that the handle obtained from it
        yields a fixed four-line corpus when iterated; the parameters are
        then initialized via ``init_params_from_data``.
        """
        corpus_lines = [
            "123 124 234 345",
            "112 122 123 345",
            "123456789",
            "123456 456789",
        ]
        params = unsupervised_morphology.MorphologyHMMParams()
        with patch("builtins.open") as fake_open:
            # Entering the opened "file" returns the same handle mock, and
            # iterating it walks the corpus (the iterator is single-pass).
            handle = fake_open.return_value
            handle.__enter__ = fake_open
            handle.__iter__ = Mock(return_value=iter(corpus_lines))
            params.init_params_from_data("no_exist_file.txt")

            segmentor = unsupervised_morphology.MorphologySegmentor(params)
            boundaries = segmentor.segment_viterbi("123123789")
            assert boundaries == [0, 2, 3, 5, 6, 9]
コード例 #2
0
    def test_segment_viterbi_no_smoothing(self):
        """Check ``segment_viterbi`` with ``smoothing_const=0.0``.

        Morph-likeness is switched off as well, and the parameters are
        initialized with ``init_uniform_params_from_data`` from a mocked
        four-line corpus. The expected result is a pair: a list of tags and
        a list of indices.
        """
        corpus_lines = [
            "123 124 234 345",
            "112 122 123 345",
            "123456789",
            "123456 456789",
        ]
        expected = (["prefix", "prefix", "stem"], [0, 3, 6, 9])
        params = unsupervised_morphology.MorphologyHMMParams(
            smoothing_const=0.0,
            use_morph_likeness=False,
        )
        with patch("builtins.open") as fake_open:
            # Entering the opened "file" returns the same handle mock, and
            # iterating it walks the corpus (the iterator is single-pass).
            handle = fake_open.return_value
            handle.__enter__ = fake_open
            handle.__iter__ = Mock(return_value=iter(corpus_lines))
            params.init_uniform_params_from_data("no_exist_file.txt")

            segmentor = unsupervised_morphology.MorphologySegmentor(params)
            assert segmentor.segment_viterbi("123123789") == expected
コード例 #3
0
    def test_segment_word_no_smoothing(self):
        """Check ``segment_word`` output with ``smoothing_const=0.0``.

        Covers the plain space-separated output, the output produced with
        ``add_affix_symbols=True``, and that both forms agree for the
        input "123".
        """
        corpus_lines = [
            "123 124 234 345",
            "112 122 123 345",
            "123456789",
            "123456 456789",
        ]
        params = unsupervised_morphology.MorphologyHMMParams(
            smoothing_const=0.0,
            use_morph_likeness=False,
        )
        with patch("builtins.open") as fake_open:
            # Entering the opened "file" returns the same handle mock, and
            # iterating it walks the corpus (the iterator is single-pass).
            handle = fake_open.return_value
            handle.__enter__ = fake_open
            handle.__iter__ = Mock(return_value=iter(corpus_lines))
            params.init_uniform_params_from_data("no_exist_file.txt")

            segmentor = unsupervised_morphology.MorphologySegmentor(params)

            plain = segmentor.segment_word("123123789789")
            assert plain == "123 123 789 789"

            marked = segmentor.segment_word(
                "123123789789", add_affix_symbols=True)
            assert marked == "123+ 123+ 789 +789"

            short_plain = segmentor.segment_word("123")
            short_marked = segmentor.segment_word(
                "123", add_affix_symbols=True)
            assert short_plain == short_marked
コード例 #4
0
            length_slope=options.length_slope,
        )
        print("Number of training words", len(model.params.word_counts))
        model.expectation_maximization(
            options.em_iter,
            options.num_cpus,
            options.model_path if options.save_checkpoint else None,
        )
        if not options.save_checkpoint:
            model.params.save(options.model_path)

    # Segmentation mode: runs only when an input file, an output file and a
    # model path have all been supplied.
    if (options.input_file is not None and options.output_file is not None
            and options.model_path is not None):
        model = unsupervised_morphology.MorphologyHMMParams.load(
            options.model_path)
        segmentor = unsupervised_morphology.MorphologySegmentor(model)
        # Memoize per-word results so each distinct word is segmented once.
        segment_cache = {}
        # The output file is opened first (as before) but is now managed by
        # ``with`` so it is flushed and closed even if opening the input,
        # reading a line, or segmenting a word raises.
        with open(options.output_file, "w", encoding="utf-8") as writer:
            with open(options.input_file, "r",
                      encoding="utf-8") as input_stream:
                for line in input_stream:
                    output = []
                    for word in line.strip().split():
                        if word not in segment_cache:
                            segment_cache[word] = segmentor.segment_word(
                                word,
                                add_affix_symbols=options.add_affix_symbols)
                        output.append(segment_cache[word])
                    # One output line per input line, words space-joined.
                    writer.write(" ".join(output) + "\n")

    if options.investigate and options.model_path is not None: