Example #1

import nltk

from rake_nltk import Rake


def test_word_tokenizer_config():
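    """Rake should tokenize with wordpunct_tokenize by default and accept
    any tokenizer callable through the word_tokenizer argument."""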
    sentence = 'This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--'

    punct_tokenized_words = [
        'This',
        'is',
        'a',
        'cooool',
        '#',
        'dummysmiley',
        ':',
        ':-)',
        ':-',
        'P',
        '<',
        '3',
        'and',
        'some',
        'arrows',
        '<',
        '>',
        '->',
        '<--',
    ]

    # Default: the built-in tokenizer produces wordpunct-style tokens.
    r = Rake()
    assert punct_tokenized_words == r._tokenize_sentence_to_words(sentence)

    # Explicitly passing wordpunct_tokenize gives the same result.
    r = Rake(word_tokenizer=nltk.tokenize.wordpunct_tokenize)
    assert punct_tokenized_words == r._tokenize_sentence_to_words(sentence)

    # Custom tokenizer: TweetTokenizer keeps hashtags and emoticons whole.
    r = Rake(word_tokenizer=nltk.tokenize.TweetTokenizer().tokenize)
    assert [
        'This',
        'is',
        'a',
        'cooool',
        '#dummysmiley',
        ':',
        ':-)',
        ':-P',
        '<3',
        'and',
        'some',
        'arrows',
        '<',
        '>',
        '->',
        '<--',
    ] == r._tokenize_sentence_to_words(sentence)
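
For context, here is a minimal usage sketch showing the same word_tokenizer option in ordinary keyword extraction, rather than through the private _tokenize_sentence_to_words method. It assumes the public rake_nltk API (extract_keywords_from_text, get_ranked_phrases); the sample text is invented for illustration.

import nltk

from rake_nltk import Rake

# A tweet-aware tokenizer keeps '#hashtags' and emoticons such as ':-)'
# as single tokens instead of splitting them at punctuation.
r = Rake(word_tokenizer=nltk.tokenize.TweetTokenizer().tokenize)

# Sample text is made up for illustration.
r.extract_keywords_from_text('Fast keyword extraction for tweets with #hashtags and smileys :-)')
print(r.get_ranked_phrases())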