Ejemplo n.º 1
0
def test_sentence_tokenize_with_combined():
    """Quoted text combined with parentheses must not be split mid-sentence."""
    tokenizer = SentenceTokenizer()
    sentences = tokenizer.tokenize(DOCUMENT4)
    assert sentences == ["わんわん。", "「にゃ?」(にゃー)わんわん。", "「わおーん。」(犬より。)"]
Ejemplo n.º 2
0
def test_sentence_tokenize_with_quotation():
    """Periods inside 「…」 quotations do not terminate the enclosing sentence."""
    tokenizer = SentenceTokenizer()
    sentences = tokenizer.tokenize(DOCUMENT3)
    expected = [
        "猫「にゃおにゃ。ただしかわいいものとする。異議は認める」。",
        "にゃお。",
        "にゃにゃ",
    ]
    assert sentences == expected
Ejemplo n.º 3
0
def test_sentence_tokenize_with_bracket():
    """Periods inside (…) brackets do not terminate the enclosing sentence."""
    tokenizer = SentenceTokenizer()
    sentences = tokenizer.tokenize(DOCUMENT2)
    expected = [
        "私は猫である(ただしかわいいものとする。異議は認める)。",
        "にゃお。",
        "にゃにゃ",
    ]
    assert sentences == expected
Ejemplo n.º 4
0
def test_sentence_tokenize():
    """Plain text splits on the default sentence terminator 。 (trailing fragment kept)."""
    tokenizer = SentenceTokenizer()
    sentences = tokenizer.tokenize(DOCUMENT1)
    assert sentences == ["私は猫である。", "にゃお。", "にゃにゃ", "わんわん。", "にゃーにゃー。"]
Ejemplo n.º 5
0
def test_sentence_tokenize_with_custom_period():
    """A custom period character replaces 。 as the split point."""
    tokenizer = SentenceTokenizer(period=".")
    sentences = tokenizer.tokenize(DOCUMENT6)
    expected = [
        "わんわん。「にゃ?」(にゃー)わんわん.",
        "「わおーん。」(犬より。)",
    ]
    assert sentences == expected
Ejemplo n.º 6
0
def test_sentence_tokenize_with_custom_patterns():
    """An extra protected-span pattern (『…』) extends the default pattern list."""
    extra_patterns = SentenceTokenizer.PATTERNS + [re.compile(r"『.*?』")]
    tokenizer = SentenceTokenizer(patterns=extra_patterns)
    sentences = tokenizer.tokenize(DOCUMENT5)
    assert sentences == ["わんわん。", "「にゃ?」(にゃー)わんわん。", "『わおーん。』(犬より。)"]