Example #1
def test_on_data():
    # Scan the raw training data for mojibake sequences that should have
    # been cleaned up before tokenization. The literals below assume the
    # original strings contained a mis-decoded apostrophe ('â' artifact).
    weird_chars = {'thatâs', 'thereâs'}
    counter = 0
    with open(ORIGINAL_TRAIN_PATH, 'r') as raw_file:
        for raw_line in raw_file:
            text = ' '.join(tokenizer(raw_line))
            for weird in weird_chars:
                if weird in text:
                    print(text)  # show the offending line for debugging
                    counter += 1
    assert counter == 0
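The test relies on a module-level ORIGINAL_TRAIN_PATH constant and a tokenizer callable defined elsewhere in the suite. A minimal sketch of those fixtures, where both the path and the regex tokenizer are hypothetical stand-ins rather than the project's actual definitions:

import re

# Hypothetical stand-ins for the fixtures the tests rely on; the real
# project presumably defines its own corpus path and tokenizer.
ORIGINAL_TRAIN_PATH = 'data/train_raw.txt'

def tokenizer(text):
    # Naive word tokenizer, used here only for illustration.
    return re.findall(r'\w+', text)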
Example #2
def test_malformed():
    # A malformed string and its well-formed counterpart must produce
    # the same token sequence.
    for text1, text2 in TEXT_MALFORMED.items():
        tokens1 = tokenizer(text1)
        tokens2 = tokenizer(text2)
        assert tokens1 == tokens2
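TEXT_MALFORMED is assumed to map each malformed input to its well-formed counterpart. An illustrative sketch of that shape (the entries are invented, not taken from the project's test data):

# Hypothetical fixture: malformed text -> its cleaned-up equivalent,
# both of which should tokenize identically.
TEXT_MALFORMED = {
    'hello   world!': 'hello world!',
    'Hello,world.': 'Hello, world.',
}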
Example #3
def test_ntokens():
    # Each sample text must tokenize into the expected number of tokens.
    for text, ntokens in TEXT_NTOKENS.items():
        tokens = tokenizer(text)
        assert len(tokens) == ntokens
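Similarly, TEXT_NTOKENS is assumed to map a sample text to its expected token count. An illustrative sketch under the same assumptions (entries invented):

# Hypothetical fixture: sample text -> expected number of tokens.
TEXT_NTOKENS = {
    'hello world': 2,
    'one, two, three': 3,
}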