Example #1
from tokenizer import Tokenizer  # assumed module path; adjust to the project layout

def test_expand():
    '''Testing expansions'''
    tok = Tokenizer()
    assert tok.expand("Foo[x3]!") == "Foo Foo Foo!"
    assert tok.expand("Foo [x 3]!") == "Foo Foo Foo!"
    assert tok.expand("Foo [x3]!") == "Foo Foo Foo!"
    assert tok.expand("Foo [x3] bar!") == "Foo Foo Foo bar!"
    assert tok.expand("Foo [x3] bar [x3]!") == "Foo Foo Foo bar bar bar!"
    assert tok.expand("Foo <bar baz>[x2]!") == "Foo bar baz bar baz!" 
Example #2
from tokenizer import Tokenizer  # assumed module path; adjust to the project layout

def test_replace():
    '''Testing the replacements applied during tokenization'''
    tok = Tokenizer()
    assert tok.decontract("hey gimme") == 'hey give me'
    assert tok.decontract("hey let's") == 'hey let_us'
    assert tok.decontract("hey wanna go") == 'hey want to go'
    assert tok.decontract("hey gotta go") == 'hey got to go'
    assert tok.decontract("hey gonna go") == 'hey going to go'
    assert tok.decontract("hey cannot go") == 'hey can not go'
    assert tok.decontract("lookit here") == 'look at here'
    assert tok.tokenize("hey gimme") == ['hey', 'give', 'me']
    assert tok.tokenize("hey let's") == ['hey', 'let_us']
    assert tok.tokenize("hey wanna go") == ['hey', 'want', 'to', 'go']
    assert tok.tokenize("hey gotta go") == ['hey', 'got', 'to', 'go']
    assert tok.tokenize("hey gonna go") == ['hey', 'going', 'to', 'go']
    assert tok.tokenize("hey cannot go") == ['hey', 'can', 'not', 'go']
    assert tok.tokenize("lookit here") == ['look', 'at', 'here']
Example #3
from tokenizer import Tokenizer  # assumed module path; adjust to the project layout

def test_tokenize():
    '''Testing the tokenize method'''
    tok = Tokenizer()
    assert tok.tokenize("") == []
    assert tok.tokenize("###") == []
    assert tok.tokenize("###") == []
    assert tok.tokenize("--") == []
    assert tok.tokenize("Foo Bar Baz.") == ["Foo", "Bar", "Baz"]
    assert tok.tokenize("Don't do that!") == ["Don't", "do", "that"]
    assert tok.tokenize("\"Don't do that!\"") == ["Don't", "do", "that"]
    assert tok.tokenize("'Foo' Bar Baz.") == ["Foo", "Bar", "Baz"]
    tok = Tokenizer(split_clitics=True)
    assert tok.tokenize("Don't do that!") == ["Do", "n't", "do", "that"]
    assert tok.tokenize("Foo'll go.") == ["Foo", "'ll", "go"]
    assert tok.tokenize("I'm new.") == ["I", "'m", "new"]
    assert tok.tokenize("He's got to go.") == ["He", "'s", "got", "to", "go"]
    assert tok.tokenize("Foo_Bar Baz.") == ["Foo_Bar", "Baz"]
    assert tok.tokenize("James' friend.") == ["James", "'s", "friend"]
    assert tok.tokenize("Piglet's eyes") == ["Piglet", "'s", "eyes"]
    tok = Tokenizer(nonword=r'[^a-zA-Z\+\'\-\&\@]')
    assert tok.tokenize("Foo_Bar Baz.") == ["Foo", "Bar", "Baz"]