Example #1
import pytest

from MicroTokenizer import HMMTokenizer  # import path is an assumption


def test_segment(input_text):
    # input_text is expected to come from a pytest fixture or parametrization.
    tokenizer = HMMTokenizer()
    tokenizer.load_model()

    result = tokenizer.segment(input_text)

    # pytest.helpers is provided by the pytest-helpers-namespace plugin.
    pytest.helpers.assert_token_equals(result, input_text)
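
Note: pytest itself does not define a pytest.helpers namespace; it typically comes from the pytest-helpers-namespace plugin. A minimal sketch of how such a helper might be registered in conftest.py (the joined-text check is an assumption about what the project's helper actually verifies):

import pytest


@pytest.helpers.register
def assert_token_equals(result, input_text):
    # Assumed check: the tokens, joined back together, reproduce the input text.
    assert "".join(result) == input_text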
Example #2
# DAGTokenizer, HMMTokenizer, the MaxMatch*Tokenizer classes, CRFTokenizer,
# MergeSolutions, and default_model_dir are assumed to be imported from
# sibling modules of the surrounding package.
class Tokenizer(object):
    def __init__(self, model_dir=None):
        if model_dir is None:
            model_dir = default_model_dir

        self.model_dir = model_dir

        self.dag_tokenizer = None  # type: DAGTokenizer
        self.hmm_tokenizer = None  # type: HMMTokenizer
        self.max_match_forward_tokenizer = None  # type: MaxMatchForwardTokenizer
        self.max_match_backward_tokenizer = None  # type: MaxMatchBackwardTokenizer
        self.max_match_bidirectional_tokenizer = None  # type: MaxMatchBidirectionalTokenizer
        self.crf_tokenizer = None  # type: CRFTokenizer

    def init_dag_tokenizer(self):
        # Lazily construct and load the DAG tokenizer on first use;
        # the other init_* methods below follow the same pattern.
        if self.dag_tokenizer is None:
            self.dag_tokenizer = DAGTokenizer(self.model_dir)
            self.dag_tokenizer.load_model()

    def cut_by_DAG(self, message):
        self.init_dag_tokenizer()
        return self.dag_tokenizer.segment(message)

    def init_hmm_tokenizer(self):
        if self.hmm_tokenizer is None:
            self.hmm_tokenizer = HMMTokenizer(self.model_dir)
            self.hmm_tokenizer.load_model()

    def cut_by_HMM(self, message):
        self.init_hmm_tokenizer()
        return self.hmm_tokenizer.segment(message)

    def cut_by_joint_model(self, message):
        # Run both the DAG and HMM backends, then merge their outputs.
        solutions = [self.cut_by_DAG(message), self.cut_by_HMM(message)]
        merge_solutions = MergeSolutions()
        best_solution = merge_solutions.merge(solutions)

        return best_solution

    cut = cut_by_DAG  # default segmentation strategy: the DAG model

    def init_max_match_forward_tokenizer(self):
        if self.max_match_forward_tokenizer is None:
            self.max_match_forward_tokenizer = MaxMatchForwardTokenizer()
            self.max_match_forward_tokenizer.load_model()

    def cut_by_max_match_forward(self, message):
        self.init_max_match_forward_tokenizer()
        return self.max_match_forward_tokenizer.segment(message)

    def init_max_match_backward_tokenizer(self):
        if self.max_match_backward_tokenizer is None:
            self.max_match_backward_tokenizer = MaxMatchBackwardTokenizer()
            self.max_match_backward_tokenizer.load_model()

    def cut_by_max_match_backward(self, message):
        self.init_max_match_backward_tokenizer()
        return self.max_match_backward_tokenizer.segment(message)

    def init_max_match_bidirectional_tokenizer(self):
        if self.max_match_bidirectional_tokenizer is None:
            self.max_match_bidirectional_tokenizer = MaxMatchBidirectionalTokenizer()
            self.max_match_bidirectional_tokenizer.load_model()

    def cut_by_max_match_bidirectional(self, message):
        self.init_max_match_bidirectional_tokenizer()
        return self.max_match_bidirectional_tokenizer.segment(message)

    def init_crf_tokenizer(self):
        if self.crf_tokenizer is None:
            self.crf_tokenizer = CRFTokenizer()
            self.crf_tokenizer.load_model()

    def cut_by_CRF(self, message):
        self.init_crf_tokenizer()
        return self.crf_tokenizer.segment(message)

    def load_custom_dict(self, dict_file):
        # TODO: not implemented yet
        pass

    def add_word(self, word, freq=None):
        # TODO: not implemented yet
        pass

    def del_word(self, word):
        # TODO: not implemented yet
        pass

    def load_user_dict(self, dict_file):
        # Make sure the DAG tokenizer exists before touching its dictionary;
        # otherwise this raises AttributeError when called before any cut.
        self.init_dag_tokenizer()
        return self.dag_tokenizer.dict_data.load_user_dict(dict_file)

    @property
    def mini_log_freq(self):
        # TODO: not implemented yet
        pass

    @property
    def average_log_freq(self):
        # TODO: not implemented yet
        pass
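
A short usage sketch for the class above. The import path and the sample sentence are assumptions; each backend constructs and loads its model lazily on first use:

from MicroTokenizer.tokenizer import Tokenizer

tokenizer = Tokenizer()

sentence = "王小明在北京的清华大学读书。"

# cut is an alias for cut_by_DAG, so this runs the DAG backend.
print(tokenizer.cut(sentence))

# Other backends are initialized on their first call.
print(tokenizer.cut_by_HMM(sentence))
print(tokenizer.cut_by_joint_model(sentence))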