/
2htmlparser.py
61 lines (49 loc) · 1.62 KB
/
2htmlparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
try:
    # Python 3: the parser class lives in html.parser.
    from html.parser import HTMLParser
except ImportError:
    # Python 2: the class lives in a top-level module of the same name;
    # rebind the name so that ``HTMLParser`` is always the class.
    import HTMLParser
    HTMLParser = HTMLParser.HTMLParser
from xml.etree import ElementTree
class NaiveHTMLParser(HTMLParser):
    """
    Python 3.x HTMLParser extension with ElementTree support.

    Feeds HTML through ``HTMLParser`` and builds a tree of
    ``xml.etree.ElementTree`` elements as tags are encountered.

    Attributes:
        root: The most recently started top-level element, or ``None`` if
            no start tag has been seen yet.
        tree: Stack of the currently open elements, outermost first.

    The parser is deliberately naive: it does not apply HTML's implicit
    closing rules (e.g. ``<li>`` closing a previous ``<li>``), but it does
    tolerate void elements, stray closing tags and unclosed elements.

    @see https://github.com/marmelo/python-htmlparser
    """

    # Elements that never have a closing tag in HTML. They must not be left
    # open on the stack, otherwise every later sibling would be nested
    # inside them.
    _VOID_TAGS = frozenset((
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "param", "source", "track", "wbr",
    ))

    def __init__(self):
        self.root = None
        self.tree = []
        # Explicit base-class call: also works where the base is an
        # old-style class and ``super()`` is unavailable.
        HTMLParser.__init__(self)

    def feed(self, data):
        """Parse *data* and return the root element seen so far (or None)."""
        HTMLParser.feed(self, data)
        return self.root

    def handle_starttag(self, tag, attrs):
        """Create an element for *tag* under the innermost open element."""
        attrib = dict(self.__filter_attrs(attrs))
        if self.tree:
            element = ElementTree.SubElement(self.tree[-1], tag, attrib)
        else:
            # No open element: this tag starts a new top-level tree.
            element = ElementTree.Element(tag, attrib)
            self.root = element
        # Void elements have no content and no end tag, so they are never
        # pushed onto the stack of open elements.
        if tag not in self._VOID_TAGS:
            self.tree.append(element)

    def handle_endtag(self, tag):
        """Close the innermost open *tag*, plus anything left open inside it.

        A closing tag that matches no open element is ignored instead of
        raising ``IndexError`` or closing an unrelated element.
        """
        for depth in range(len(self.tree) - 1, -1, -1):
            if self.tree[depth].tag == tag:
                del self.tree[depth:]
                return

    def handle_startendtag(self, tag, attrs):
        """Handle a self-closing tag such as ``<span/>`` or ``<br/>``."""
        self.handle_starttag(tag, attrs)
        # For a void tag nothing was pushed, so this is a no-op.
        self.handle_endtag(tag)

    def handle_data(self, data):
        """Attach character data to the current element.

        Text before the first child goes to ``.text``; text after a child
        goes to that child's ``.tail`` (the ElementTree convention). Data
        is appended, never overwritten. Data outside any open element is
        dropped.
        """
        if not self.tree:
            return
        current = self.tree[-1]
        if len(current):
            last_child = current[-1]
            last_child.tail = (last_child.tail or "") + data
        else:
            current.text = (current.text or "") + data

    def get_root_element(self):
        """Return the root element, or ``None`` if nothing was parsed."""
        return self.root

    def __filter_attrs(self, attrs):
        """Return the ``(name, value)`` pairs whose name and value are both
        non-empty (valueless attributes such as ``disabled`` are dropped)."""
        if not attrs:
            return []
        return [(name, value) for name, value in attrs if name and value]
def main():
    """Parse ``2.html`` from the current directory and print its tables."""
    with open('2.html', 'r') as content_file:
        html = content_file.read()

    # NaiveHTMLParser takes no constructor arguments; it is itself the
    # HTMLParser, so no separate HTMLParser instance is needed.
    parser = NaiveHTMLParser()
    root = parser.feed(html)
    parser.close()

    # root is an xml.etree.ElementTree.Element and supports the ElementTree
    # API (e.g. you may use its limited support for XPath expressions).
    # It is None when the input contained no tags at all.
    if root is None:
        return

    # Example: the page title would be root.find('head/title').text

    # Print all tables at any depth. findall('table') would only match
    # direct children of the root element.
    for table in root.iter('table'):
        print(table)


if __name__ == "__main__":
    main()