"""Tests for word_tokenize."""

# NOTE: the import path below is an assumption; adjust it to the project's
# actual module layout. generator_cmp is sketched at the bottom of this file.
from tokenizer import word_tokenize


def test_word_tokenize_stopwords():
    assert generator_cmp(word_tokenize('This is a lot of stopwords'),
                         ['lot', 'stopwords'])

    test_case = 'I should get an empty list'
    assert generator_cmp(word_tokenize(test_case, test_case.split()), [])
    assert generator_cmp(word_tokenize(test_case, []),
                         ['should', 'get', 'an', 'empty', 'list'])


def test_word_tokenize_remove_urls():
    assert generator_cmp(
        word_tokenize('This is a www.google.com hello', remove_urls=True),
        ['hello'])
    assert generator_cmp(
        word_tokenize('This is another maps.google.com without',
                      remove_urls=False),
        ['another', 'mapsgooglecom', 'without'])


def test_word_tokenize():
    assert generator_cmp(word_tokenize('Hello cruel world'),
                         ['hello', 'cruel', 'world'])
    assert generator_cmp(word_tokenize(''), [])
    assert generator_cmp(word_tokenize('empty +@@ punctuation'),
                         ['empty', 'punctuation'])
    assert generator_cmp(word_tokenize("This shouldn't fail"),
                         ['shouldnt', 'fail'])
    assert generator_cmp(word_tokenize('Cat and dog'), ['cat', 'dog'])
    assert generator_cmp(word_tokenize('I own a Dell laptop'),
                         ['dell', 'laptop'])  # Regression test


def test_word_tokenize_large_whitespace():
    assert generator_cmp(word_tokenize('This  \n   is \r a   \ttest'),
                         ['test'])


def test_word_tokenize_punctuation():
    # Punctuation should always be removed from front and back
    assert generator_cmp(word_tokenize('!My name is Michael!'),
                         ['name', 'michael'])


def test_word_tokenize_digits():
    # Pure digits should be ignored, but combinations of digits and
    # letters should be included
    assert generator_cmp(word_tokenize('gumball800 is cool'),
                         ['gumball800', 'cool'])
    assert generator_cmp(word_tokenize('90 + ten'), ['ten'])


def test_word_tokenize_single_letters():
    # Single letter tokens should be completely ignored
    assert generator_cmp(word_tokenize('a e i o u vowels', []), ['vowels'])
    assert generator_cmp(word_tokenize('!!!@#@##@#I *a Gold', []), ['gold'])
    assert generator_cmp(word_tokenize('aa i', []), ['aa'])


def test_word_tokenize_special_punctuation():
    assert generator_cmp(word_tokenize('self-determination'),
                         ['self', 'determination'])
    assert generator_cmp(word_tokenize('Red/Green'), ['red', 'green'])
    assert generator_cmp(word_tokenize('Red\\Green'), ['red', 'green'])
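

# The tests above rely on a generator_cmp helper. The definition below is a
# minimal stand-in sketch, assuming the helper simply exhausts the generator
# and compares the result against the expected list; the project's real
# helper may instead compare lazily, element by element.
def generator_cmp(gen, expected):
    """Return True if `gen` yields exactly the items in `expected`."""
    return list(gen) == list(expected)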