forked from jpzeng/CrawlerAPPL
/
Prog-7-html5lib-test.py
44 lines (39 loc) · 2.05 KB
/
Prog-7-html5lib-test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# -*- coding: utf-8 -*-
import html5lib
print('通过指定treebuilder来解析:')
document='<html><head><title>Test</title></head><body><h1 align="center">Big data news</h1><h1 align="center">AI news</h1><h1 align="right">2018.8.1</h1></body></html>'
#直接调用html5lib.parse来解析,解析时采用lxml构建树的方法
content=html5lib.parse(document,treebuilder="lxml",namespaceHTMLElements=False)
#指定要提取的内容所在的标签路径
rows=content.xpath('/html/body/h1')
for row in rows:
t=row.xpath('./text()')[0] #定位到标签节点后,通过text()提取内容
print(t)
print('通过指定tree来解析:')
document='<html><head><title>Test</title></head><body><h1 align="center">Big data news</h1><h1 align="center">AI news</h1><h1 align="right">2018.8.1</h1></body></html>'
#构造HTMLParser实例,指定构造lxml的树
p=html5lib.HTMLParser(strict=False,tree=html5lib.getTreeBuilder('lxml'),namespaceHTMLElements=False)
#解析HTML文档
t=p.parse(document)
rows=t.xpath('/html/body/h1')
for row in rows:
t=row.xpath('./text()')[0]
print(t)
print('通过指定tree来提取超链接:')
document='<html><head><title>Test</title></head><body><a href="www.baidu.com">baidu</body></html>'
p=html5lib.HTMLParser(strict=False,tree=html5lib.getTreeBuilder('lxml'),namespaceHTMLElements=False)
t=p.parse(document)
#通过findall来查找所有标签名称为a的节点
a_tags = t.findall(".//a")
for a in a_tags:
url = a.attrib["href"] #通过属性名称来获得属性值
print(url)
print('处理标签不完整或有错误的HTML:')
#这个HTML文档中有不完整的标签:缺少一个</h1>、有错误的标签:</m1>
document='<html><head><title>Test</title></head><body><h1 align="center">Big data news</h1><h1 align="center">AI news<h1 align="right">2018.8.1</m1></body></html>'
p=html5lib.HTMLParser(strict=False,tree=html5lib.getTreeBuilder('lxml'),namespaceHTMLElements=False)
t=p.parse(document)
rows=t.xpath('/html/body/h1')
for row in rows:
t=row.xpath('./text()')[0]
print(t)