# tokenizer, ORIGINAL_TRAIN_PATH, TEXT_MALFORMED, and TEXT_NTOKENS are
# provided elsewhere in the project's test fixtures.

def test_on_data():
    # Scan the raw training data and assert the tokenizer strips mojibake
    # artifacts ("thatâs"/"thereâs" are garbled forms of "that's"/"there's").
    weird_chars = {'thatâs', 'thereâs'}
    counter = 0
    with open(ORIGINAL_TRAIN_PATH, 'r') as raw_file:
        for raw_line in raw_file:
            text = ' '.join(tokenizer(raw_line))
            for char in weird_chars:
                if char in text:
                    print(text)  # show the offending line for debugging
                    counter += 1
    assert counter == 0

def test_malformed():
    # Each malformed text must tokenize identically to its well-formed
    # counterpart.
    for text1, text2 in TEXT_MALFORMED.items():
        tokens1 = tokenizer(text1)
        tokens2 = tokenizer(text2)
        assert tokens1 == tokens2

def test_ntokens():
    # The tokenizer must produce the expected number of tokens per text.
    for text, ntokens in TEXT_NTOKENS.items():
        tokens = tokenizer(text)
        assert ntokens == len(tokens)
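

# --- Illustrative fixtures: a minimal sketch, not the project's data ---
# The tests above assume fixtures defined elsewhere in the repo. The shapes
# below are inferred from how the tests use them; every path, entry, and
# token count here is a hypothetical assumption for illustration only.

# Assumed path to the raw training file scanned by test_on_data.
ORIGINAL_TRAIN_PATH = 'data/train.csv'

# Malformed text mapped to its well-formed equivalent; test_malformed
# asserts both sides tokenize to the same token list (hypothetical entries).
TEXT_MALFORMED = {
    'thatâs fine': "that's fine",
    'hello ,world': 'hello, world',
}

# Text mapped to its expected token count (hypothetical entries; counts
# assume the tokenizer splits punctuation and clitics).
TEXT_NTOKENS = {
    'hello, world': 3,   # e.g. ['hello', ',', 'world']
    "that's fine": 4,    # e.g. ['that', "'s", 'fine', ...] depends on tokenizer
}

# Typical invocation, assuming this file lives in the repo's test suite:
#   pytest test_tokenizer.py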