def test_main():
    from MicroTokenizer import (
        hmm_tokenizer,
        crf_tokenizer,
        dag_tokenizer,
        max_match_forward_tokenizer,
        max_match_backward_tokenizer,
        max_match_bidirectional_tokenizer,
    )

    input_text = "王小明在北京的清华大学读书。"

    # Segment the text with each of the available algorithms.

    result = hmm_tokenizer.segment(input_text)
    print(result)

    result = crf_tokenizer.segment(input_text)
    print(result)

    result = max_match_forward_tokenizer.segment(input_text)
    print(result)

    result = max_match_backward_tokenizer.segment(input_text)
    print(result)

    result = max_match_bidirectional_tokenizer.segment(input_text)
    print(result)

    result = dag_tokenizer.segment(input_text)
    print(result)
Example #2
from MicroTokenizer import dag_tokenizer, hmm_tokenizer
# Import path assumed from the MicroTokenizer source layout.
from MicroTokenizer.merge_solutions import MergeSolutions


def _cut_by_dag_hmm_joint_model(message):
    # Segment with both the DAG and HMM tokenizers, then merge the two
    # candidate segmentations into a single best solution.
    solutions = [
        dag_tokenizer.segment(message),
        hmm_tokenizer.segment(message),
    ]
    merge_solutions = MergeSolutions()
    best_solution = merge_solutions.merge(solutions)

    return best_solution
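
For illustration, a minimal call to the helper above (assuming the imports sketched in this example); the exact merged token list depends on the bundled DAG and HMM models:

best = _cut_by_dag_hmm_joint_model("王小明在北京的清华大学读书。")
print(best)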
Example #3
def test_main():
    from MicroTokenizer import dag_tokenizer

    tokens = dag_tokenizer.segment("知识就是力量")
    print(tokens)
Example #4
from MicroTokenizer import dag_tokenizer


def cut(message, HMM=False):
    # With HMM=True, merge the DAG and HMM segmentations via the joint
    # model defined in Example #2; otherwise use the DAG tokenizer alone.
    if HMM:
        return _cut_by_dag_hmm_joint_model(message)
    else:
        return dag_tokenizer.segment(message)
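
A minimal usage sketch for the cut() wrapper above, assuming the imports and the _cut_by_dag_hmm_joint_model helper from Example #2 are in scope; the printed tokens depend on the bundled models, so the calls only illustrate the two code paths:

result = cut("王小明在北京的清华大学读书。")  # DAG tokenizer only
print(result)

result = cut("王小明在北京的清华大学读书。", HMM=True)  # DAG and HMM outputs merged
print(result)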
Example #5
from MicroTokenizer import (
    hmm_tokenizer,
    crf_tokenizer,
    dag_tokenizer,
    max_match_forward_tokenizer,
    max_match_backward_tokenizer,
    max_match_bidirectional_tokenizer,
)

input_text = "王小明在北京的清华大学读书。"

# Segment the text with each of the available algorithms.

result = hmm_tokenizer.segment(input_text)
print(result)

result = crf_tokenizer.segment(input_text)
print(result)

result = max_match_forward_tokenizer.segment(input_text)
print(result)

result = max_match_backward_tokenizer.segment(input_text)
print(result)

result = max_match_bidirectional_tokenizer.segment(input_text)
print(result)

result = dag_tokenizer.segment(input_text)
print(result)