def test_tokenize_japanese_paragraph():
    tokenizer = Tokenizer('japanese')
    expected = (
        '1つ目の文章です。',
        'その次は何が来ますか?',
        '「2つ目の文章」です。'
    )
    paragraph = '1つ目の文章です。その次は何が来ますか? 「2つ目の文章」です。'
    assert expected == tokenizer.to_sentences(paragraph)
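
These tests appear to target sumy's sentence tokenizer. A minimal usage sketch, assuming the class under test is sumy.nlp.tokenizers.Tokenizer with its to_sentences/to_words methods (Japanese support additionally needs the tinysegmenter package installed):

from sumy.nlp.tokenizers import Tokenizer

# Split a paragraph into sentences, then each sentence into words.
tokenizer = Tokenizer('english')
for sentence in tokenizer.to_sentences('First sentence. And a second one.'):
    print(tokenizer.to_words(sentence))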
Example #2
def test_tokenize_korean_paragraph():
    tokenizer = Tokenizer('korean')
    expected = (
        '회사 동료 분들과 다녀왔는데 분위기도 좋고 음식도 맛있었어요',
        '다만, 강남 토끼정이 강남 쉑쉑버거 골목길로 쭉 올라가야 하는데 다들 쉑쉑버거의 유혹에 넘어갈 뻔 했답니다',
        '강남역 맛집 토끼정의 외부 모습.'
    )

    paragraph = '회사 동료 분들과 다녀왔는데 분위기도 좋고 음식도 맛있었어요 다만, 강남 토끼정이 강남 쉑쉑버거 골목길로 쭉 올라가야 하는데 다들 쉑쉑버거의 유혹에 넘어갈 뻔 했답니다 강남역 맛집 토끼정의 외부 모습.'
    assert expected == tokenizer.to_sentences(paragraph)
def test_tokenize_chinese_paragraph():
    tokenizer = Tokenizer('chinese')
    expected = (
        '我正在为这个软件添加中文支持。',
        '这个软件是用于文档摘要!',
        '这个软件支持网页和文本两种输入格式?'
    )

    paragraph = '我正在为这个软件添加中文支持。这个软件是用于文档摘要!这个软件支持网页和文本两种输入格式?'
    assert expected == tokenizer.to_sentences(paragraph)
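
Japanese and Chinese mark sentence boundaries with explicit terminal punctuation, so a paragraph can be split without a trained model. A standalone regex sketch of that idea (not the tokenizer's actual implementation), covering the full-width and half-width terminators seen in the test data above:

import re

def split_cjk(text):
    # Take a run of non-terminator characters plus one optional terminator.
    return [s for s in re.findall(r'[^。！？!?]*[。！？!?]?', text) if s]

print(split_cjk('我正在为这个软件添加中文支持。这个软件是用于文档摘要!'))
# ['我正在为这个软件添加中文支持。', '这个软件是用于文档摘要!']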
Example #4
def test_tokenize_sentences_with_abbreviations():
    tokenizer = Tokenizer("english")
    sentences = tokenizer.to_sentences(
        "There are people who are weird, e.g. normal people. These people know you."
    )

    expected = (
        "There are people who are weird, e.g. normal people.",
        "These people know you.",
    )
    assert expected == sentences
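
The abbreviation case is what makes this test interesting: a naive splitter would break after "e.g.". The English tokenizer presumably delegates to NLTK's pre-trained punkt model with extra abbreviations registered. A hedged sketch of that mechanism using NLTK directly (_params.abbrev_types is punkt's internal abbreviation set; entries are lower-cased and stored without the trailing period):

import nltk

nltk.download('punkt')  # one-time fetch of the pre-trained punkt models
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
tokenizer._params.abbrev_types.add('e.g')
print(tokenizer.tokenize('There are people who are weird, e.g. normal people. These people know you.'))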
def test_tokenize_paragraph():
    tokenizer = Tokenizer("english")
    sentences = tokenizer.to_sentences("""
        I am a very nice sentence with comma, but..
        This is next sentence. "I'm bored", said Pepek.
        Ou jee, duffman is here.
    """)

    expected = (
        "I am a very nice sentence with comma, but..",
        "This is next sentence.",
        '"I\'m bored", said Pepek.',
        "Ou jee, duffman is here.",
    )
    assert expected == sentences
def test_ensure_czech_tokenizer_available():
    tokenizer = Tokenizer("czech")
    assert "czech" == tokenizer.language

    sentences = tokenizer.to_sentences("""
        Měl jsem sen, že toto je sen. Bylo to také zvláštní.
        Jakoby jsem plaval v moři rekurze.
    """)

    expected = (
        "Měl jsem sen, že toto je sen.",
        "Bylo to také zvláštní.",
        "Jakoby jsem plaval v moři rekurze.",
    )
    assert expected == sentences
Example #7
    def test_ensure_czech_tokenizer_available(self):
        tokenizer = Tokenizer("czech")
        self.assertEqual("czech", tokenizer.language)

        sentences = tokenizer.to_sentences("""
            Měl jsem sen, že toto je sen. Bylo to také zvláštní.
            Jakoby jsem plaval v moři rekurze.
        """)

        expected = (
            "Měl jsem sen, že toto je sen.",
            "Bylo to také zvláštní.",
            "Jakoby jsem plaval v moři rekurze.",
        )
        self.assertEqual(expected, sentences)
def test_slovak_alias_into_czech_tokenizer():
    tokenizer = Tokenizer("slovak")
    assert tokenizer.language == "slovak"

    sentences = tokenizer.to_sentences("""
        Je to veľmi fajn. Bodaj by nie.
        Ale na druhej strane čo je to oproti inému?
        To nechám na čitateľa.
    """)

    expected = (
        "Je to veľmi fajn.",
        "Bodaj by nie.",
        "Ale na druhej strane čo je to oproti inému?",
        "To nechám na čitateľa.",
    )
    assert expected == sentences
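
Note what this test pins down: the tokenizer reports its language as "slovak", yet per the test's name the Czech model does the work underneath, since punkt ships no Slovak model. A tiny sketch of that alias pattern (hypothetical names, not sumy's actual code):

# Map languages without their own model onto a close relative's model.
LANGUAGE_ALIASES = {'slovak': 'czech'}

def resolve_model_language(language):
    return LANGUAGE_ALIASES.get(language, language)

assert resolve_model_language('slovak') == 'czech'
assert resolve_model_language('czech') == 'czech'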
Example #10
    def test_slovak_alias_into_czech_tokenizer(self):
        tokenizer = Tokenizer("slovak")
        self.assertEqual(tokenizer.language, "slovak")

        sentences = tokenizer.to_sentences("""
            Je to veľmi fajn. Bodaj by nie.
            Ale na druhej strane čo je to oproti inému?
            To nechám na čitateľa.
        """)

        expected = (
            "Je to veľmi fajn.",
            "Bodaj by nie.",
            "Ale na druhej strane čo je to oproti inému?",
            "To nechám na čitateľa.",
        )
        self.assertEqual(expected, sentences)
Example #13
    def test_tokenize_japanese_paragraph(self):
        tokenizer = Tokenizer('japanese')
        expected = ('1つ目の文章です。', 'その次は何が来ますか?', '「2つ目の文章」です。')
        paragraph = '1つ目の文章です。その次は何が来ますか? 「2つ目の文章」です。'
        self.assertEqual(expected, tokenizer.to_sentences(paragraph))