-
Notifications
You must be signed in to change notification settings - Fork 0
/
test.py
37 lines (29 loc) · 1.08 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# -*- coding: utf-8 -*-
import DataOutput
import HtmlDownloader
import HtmlParser
import URLManager
class SpiderMan(object):
    """Crawl scheduler that drives four collaborators in a loop.

    The collaborators are a URL manager (queue of pending / visited URLs),
    a downloader, a parser and an output sink. Their exact contracts live
    in the sibling modules; this class only relies on the calls made below.
    """

    def __init__(self):
        """Create one instance of each collaborator."""
        # NOTE(review): this file binds the four names below with plain
        # ``import X`` statements, i.e. as modules, and calling a module
        # raises TypeError. Presumably each module defines a same-named
        # class and the imports should read ``from X import X`` -- confirm
        # against those modules before changing the imports.
        self.manager = URLManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url, max_pages=100):
        """Crawl starting from ``root_url`` and then write the collected data.

        The loop keeps running while the URL manager reports a pending URL
        and ``old_url_size()`` is still below ``max_pages``. A failure on
        one URL is reported and the crawl moves on to the next URL.
        ``output_html()`` is always called once the loop ends.

        Args:
            root_url: The first URL handed to the URL manager.
            max_pages: Upper bound compared against the manager's
                ``old_url_size()``. Defaults to 100, the value that used
                to be hard-coded here.
        """
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < max_pages):
            # Kept outside the ``try`` so the failure report can always
            # name the URL, even when ``get_new_url()`` itself raises.
            new_url = None
            try:
                new_url = self.manager.get_new_url()
                # NOTE(review): ``html`` is downloaded but never handed to
                # the parser, and the parser object is called directly
                # with only the URL. This looks unintended -- confirm the
                # parser's real entry point and arguments.
                html = self.downloader.download(new_url)
                new_urls, data = self.parser(new_url)
                # NOTE(review): ``new_urls`` looks like a collection, yet it
                # goes through the same ``add_new_url`` call used for the
                # single root URL above -- verify against the URL manager.
                self.manager.add_new_url(new_urls)
                self.output.store_data(data)
                # Progress message: "already crawled {} links".
                print('已经抓取了{}个连接'.format(self.manager.old_url_size()))
            except Exception as exc:
                # Best-effort crawl: keep going, but say which URL failed
                # and why instead of printing a bare "crawl failed".
                print('爬取失败: {} ({!r})'.format(new_url, exc))
        self.output.output_html()
if __name__ == '__main__':
    # Script entry point: seed the crawl with a Baidu Baike article whose
    # percent-encoded title decodes to the Chinese term for "web crawler".
    root_url = "https://baike.baidu.com/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB"
    SpiderMan().crawl(root_url)
# GitHub test
# 2018-05-30 19:21:58