Example #1
def test_tokenize_korean_paragraph():
    tokenizer = Tokenizer('korean')
    expected = (
        '회사 동료 분들과 다녀왔는데 분위기도 좋고 음식도 맛있었어요',
        '다만, 강남 토끼정이 강남 쉑쉑버거 골목길로 쭉 올라가야 하는데 다들 쉑쉑버거의 유혹에 넘어갈 뻔 했답니다',
        '강남역 맛집 토끼정의 외부 모습.')

    paragraph = '회사 동료 분들과 다녀왔는데 분위기도 좋고 음식도 맛있었어요 다만, 강남 토끼정이 강남 쉑쉑버거 골목길로 쭉 올라가야 하는데 다들 쉑쉑버거의 유혹에 넘어갈 뻔 했답니다 강남역 맛집 토끼정의 외부 모습.'
    assert expected == tokenizer.to_sentences(paragraph)
Example #2
def test_tokenize_sentences_with_abbreviations():
    tokenizer = Tokenizer("english")
    sentences = tokenizer.to_sentences(
        "There are people who are weird, e.g. normal people. These people know you."
    )

    expected = (
        "There are people who are weird, e.g. normal people.",
        "These people know you.",
    )
    assert expected == sentences
Example #3
def test_tokenize_bangla_paragraph():
    tokenizer = Tokenizer('bangla')
    expected = (
        'মানুষের সুন্দর মুখ দেখে আনন্দিত হয়ো না।',
        'স্বভাবে সে সুন্দর নয়, দেখতে সুন্দর হলেও তার স্বভাব, তার স্পর্শ, তার রীতিনীতিকে মানুষ ঘৃণা করে।',
        'দুঃস্বভাবের মানুষ মানুষের হৃদয়ে জ্বালা ও বেদনা দেয়?',
        'তার সুন্দর মুখে মানুষ তৃপ্তি পায় না!')

    paragraph = 'মানুষের সুন্দর মুখ দেখে আনন্দিত হয়ো না। স্বভাবে সে সুন্দর নয়, দেখতে সুন্দর হলেও তার স্বভাব, তার স্পর্শ, তার রীতিনীতিকে মানুষ ঘৃণা করে। দুঃস্বভাবের মানুষ মানুষের হৃদয়ে জ্বালা ও বেদনা দেয়? তার সুন্দর মুখে মানুষ তৃপ্তি পায় না! '
    assert expected == tokenizer.to_sentences(paragraph)
Example #4
def test_tokenize_paragraph():
    tokenizer = Tokenizer("english")
    sentences = tokenizer.to_sentences("""
        I am a very nice sentence with comma, but..
        This is next sentence. "I'm bored", said Pepek.
        Ou jee, duffman is here.
    """)

    expected = (
        "I am a very nice sentence with comma, but..",
        "This is next sentence.",
        '"I\'m bored", said Pepek.',
        "Ou jee, duffman is here.",
    )
    assert expected == sentences
Example #5
def test_ensure_czech_tokenizer_available():
    tokenizer = Tokenizer("czech")
    assert "czech" == tokenizer.language

    sentences = tokenizer.to_sentences("""
        Měl jsem sen, že toto je sen. Bylo to také zvláštní.
        Jakoby jsem plaval v moři rekurze.
    """)

    expected = (
        "Měl jsem sen, že toto je sen.",
        "Bylo to také zvláštní.",
        "Jakoby jsem plaval v moři rekurze.",
    )
    assert expected == sentences
Example #6
def test_slovak_alias_into_czech_tokenizer():
    tokenizer = Tokenizer("slovak")
    assert tokenizer.language == "slovak"

    sentences = tokenizer.to_sentences("""
        Je to veľmi fajn. Bodaj by nie.
        Ale na druhej strane čo je to oproti inému?
        To nechám na čitateľa.
    """)

    expected = (
        "Je to veľmi fajn.",
        "Bodaj by nie.",
        "Ale na druhej strane čo je to oproti inému?",
        "To nechám na čitateľa.",
    )
    assert expected == sentences
Example #7
def test_tokenize_chinese_paragraph():
    tokenizer = Tokenizer('chinese')
    expected = ('我正在为这个软件添加中文支持。', '这个软件是用于文档摘要!', '这个软件支持网页和文本两种输入格式?')

    paragraph = '我正在为这个软件添加中文支持。这个软件是用于文档摘要!这个软件支持网页和文本两种输入格式?'
    assert expected == tokenizer.to_sentences(paragraph)
Example #8
def test_tokenize_japanese_paragraph():
    tokenizer = Tokenizer('japanese')
    expected = ('1つ目の文章です。', 'その次は何が来ますか?', '「2つ目の文章」です。')
    paragraph = '1つ目の文章です。その次は何が来ますか? 「2つ目の文章」です。'
    assert expected == tokenizer.to_sentences(paragraph)
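
All of the examples above exercise the same Tokenizer API: construct a Tokenizer for a language, split a paragraph with to_sentences(), and compare the result against an expected tuple of sentences (the Bangla test also touches to_words()). Below is a minimal standalone sketch of that pattern, assuming the class shown here is the one importable from sumy.nlp.tokenizers; the import path is an assumption, and only Tokenizer(language), to_sentences() and to_words() appear in the tests themselves.

# Usage sketch, not part of the test suite. The import path is an assumption
# (sumy-style API); the tests above only demonstrate Tokenizer(language),
# to_sentences() and to_words(). English sentence splitting typically also
# requires NLTK's "punkt" model to be available.
from sumy.nlp.tokenizers import Tokenizer

tokenizer = Tokenizer("english")
paragraph = "There are people who are weird, e.g. normal people. These people know you."

# Split the paragraph into sentences, then each sentence into word tokens.
for sentence in tokenizer.to_sentences(paragraph):
    print(sentence)
    print(tokenizer.to_words(sentence))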