Example #1
def tokenize(txt, included=True, replace_tags=None):
    # Normalize the input first; pp is the preprocessing module imported
    # elsewhere in this file.
    txt = pp.preprocess(txt, included=included)

    # Optional caller-supplied hook for rewriting tags before splitting.
    if replace_tags is not None:
        txt = replace_tags(txt)

    tokens = []
    # splitrx is a module-level compiled regex with five alternative
    # capture groups; exactly one group is non-empty per match, so the
    # token type is simply the index of the group that matched.
    for (v1, v2, v3, v4, v5) in splitrx.findall(txt):
        if v5:
            tokens.append((5, v5))
        elif v4:
            tokens.append((4, v4))
        elif v3:
            tokens.append((3, v3))
        elif v2:
            tokens.append((2, v2))
        elif v1:
            tokens.append((1, v1))

    # Sentinel token marking end of input.
    tokens.append((None, ''))

    return tokens
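
The function assumes two module-level names it does not define: the preprocessing module pp and the compiled regex splitrx. Below is a minimal, self-contained sketch of how they might look and how tokenize would be called; the stub, the pattern, and the sample input are illustrative assumptions, not the original library's definitions.

import re

# Hypothetical stand-ins for the names tokenize expects to find in its
# module namespace (demo assumptions, not the real definitions).
class pp:
    @staticmethod
    def preprocess(txt, included=True):
        return txt  # pass-through preprocessor for the demo

# Five alternative groups, so exactly one of (v1..v5) is filled per match.
splitrx = re.compile(r"(\n)|(\{\{)|(\}\})|(\|)|([^\n{}|]+)")

print(tokenize("{{foo|bar}}"))
# [(2, '{{'), (5, 'foo'), (4, '|'), (5, 'bar'), (3, '}}'), (None, '')]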
Example #2
def preprocess(s, expected, included=True):
    # Test helper: run the preprocessor, show the result, and check it
    # against the expected output (pass expected=None to skip the check).
    res = pp.preprocess(s, included=included)
    print("preprocess(%r) -> %r" % (s, res))
    if expected is not None:
        assert res == expected, "bad preprocess result"
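
For completeness, a hedged example of this helper in action, reusing the pass-through pp stub from the sketch above; the input and expected strings are invented for illustration.

# With the pass-through stub, the text survives preprocessing unchanged.
preprocess("hello world", expected="hello world")
# prints: preprocess('hello world') -> 'hello world'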