def tokenize(txt, included=True, replace_tags=None):
    """Preprocess *txt* and split it into a list of (kind, text) tokens.

    The text is first run through pp.preprocess (forwarding *included*);
    if *replace_tags* is given, it is applied to the preprocessed text.
    Each splitrx match contributes one token whose kind (1-5) is the
    highest-numbered non-empty capture group.  A trailing (None, '')
    sentinel marks end of input.
    """
    txt = pp.preprocess(txt, included=included)
    if replace_tags is not None:
        txt = replace_tags(txt)
    tokens = []
    for groups in splitrx.findall(txt):
        # Take the highest-priority (highest-numbered) non-empty group.
        for kind in (5, 4, 3, 2, 1):
            text = groups[kind - 1]
            if text:
                tokens.append((kind, text))
                break
    tokens.append((None, ''))  # end-of-input sentinel
    return tokens
def preprocess(s, expected, included=True):
    """Run pp.preprocess on *s*, print the result, and optionally check it.

    *included* is forwarded to pp.preprocess.  When *expected* is not
    None, assert that the result equals it (test-helper style check).
    """
    res = pp.preprocess(s, included=included)
    # Parenthesized print: valid on both Python 2 (prints the formatted
    # string) and Python 3, unlike the original print statement.
    print("preprocess(%r) -> %r" % (s, res))
    if expected is not None:
        assert res == expected, "bad preprocess result"