def parse(str):
    """Parse a string into an XML document.

    The text is cut into tokens with regular expressions assembled from the
    ``lib.easy.regex`` combinators, and each token is reported to a freshly
    created ``ParseHandler``:

    * a tag with a ``/`` right before the closing ``>`` -> ``handler.tag``
    * any other opening tag                             -> ``handler.tag_start``
    * a closing tag (``</name>``)                       -> ``handler.tag_end``
    * character data                                    -> ``handler.data``

    Attribute values (with their surrounding quotes removed) and character
    data have ``&name;`` references replaced through ``_xml_entities``.

    Args:
        str: The XML source text.  The name shadows the builtin; it is kept
            because it is part of the function's signature.

    Returns:
        The ``result`` attribute of the handler once all tokens are reported.

    Note:
        A ``&name;`` reference whose name is not in ``_xml_entities``
        propagates whatever that lookup raises (a ``KeyError`` if it is a
        plain dict -- its definition is not visible here).
    """
    from lib.easy.regex import grp, rep, opt, alt, matchone, matchall

    # --- Regex building blocks -------------------------------------------
    # Most patterns come in two flavours: one with named groups, used to take
    # a single token apart, and an "anon" one without groups, used inside the
    # tokenizer pattern.
    ident = '[A-Za-z_][A-Za-z0-9_:-]*'
    quoted = alt('"[^"]*"', "'[^']*'")
    ws = r'[\n\r\t ]*'
    wsr = r'[\n\r\t ]+'

    attr = grp(ident, 'name') + ws + '=' + ws + grp(quoted, 'value')
    attr_anon = ident + ws + '=' + ws + quoted

    tag = (
        '<' + ws + grp(ident, 'name')
        + alt(ws, wsr + grp(rep(attr_anon + ws, '+'), 'attrs'))
        + opt(grp('/', 'end')) + ws + '>'
    )
    tag_anon = (
        '<' + ws + ident
        + alt(ws, wsr + rep(attr_anon + ws, '+'))
        + opt('/') + ws + '>'
    )

    tagend = '<' + ws + '/' + ws + grp(ident, 'name') + ws + '>'
    tagend_anon = '<' + ws + '/' + ws + ident + ws + '>'

    declaration = '<[?!][^<>]+>'

    text_piece = alt(
        grp('[^<&]+', 'char'),
        '&' + grp(rep('[a-z]', '+'), 'name') + ';',
    )
    text_piece_anon = alt('[^<&]+', '&' + rep('[a-z]', '+') + ';')
    text_run = rep(text_piece_anon, '+')

    # One pattern recognising any token; the group name tells which kind it is.
    token_pattern = alt(
        grp(tag_anon, 'tag'),
        grp(tagend_anon, 'tagend'),
        grp(text_run, 'data'),
        declaration,
    )
    attr_pattern = attr + ws

    handler = ParseHandler()

    def decode(text):
        """Return *text* with every ``&name;`` reference substituted."""
        def expand(piece):
            # A piece is either a literal run ('char') or an entity ('name').
            if 'char' not in piece:
                return _xml_entities[piece['name']]
            return piece['char']

        return ''.join(expand(piece) for piece in matchall(text_piece, text))

    for token in matchall(token_pattern, str):
        # Declarations (<?...> / <!...>) are matched by the tokenizer but fall
        # through every branch below, so they are dropped.
        if 'tag' in token:
            parts = matchone(tag, token['tag'])
            if 'attrs' in parts:
                # [1:-1] strips the quote characters around the value.
                attrs = {
                    pair['name']: decode(pair['value'][1:-1])
                    for pair in matchall(attr_pattern, parts['attrs'])
                }
            else:
                attrs = {}
            # The 'end' group is the "/" before ">": the tag closes itself.
            report = handler.tag if 'end' in parts else handler.tag_start
            report(parts['name'], attrs)
            continue

        if 'tagend' in token:
            closing = matchone(tagend, token['tagend'])
            handler.tag_end(closing['name'])
            continue

        if 'data' in token:
            handler.data(decode(token['data']))

    return handler.result
def parse_data(data):
    """Return *data* with each ``&name;`` reference replaced by its text.

    ``data`` is scanned with ``matchall(data_elem, data)``; every match is
    either a literal run (key ``'char'``) that is copied as is, or an entity
    (key ``'name'``) that is looked up in ``_xml_entities``.  The pieces are
    concatenated in match order.

    NOTE(review): ``data_elem``, ``matchall`` and ``_xml_entities`` are not
    defined in this function; they must be provided by the enclosing scope.
    """
    def expand(piece):
        # Anything that is not a literal run is an entity reference.
        if 'char' not in piece:
            return _xml_entities[piece['name']]
        return piece['char']

    return ''.join(expand(piece) for piece in matchall(data_elem, data))