This repository has been archived by the owner on Aug 30, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
CrawlerMain.py
136 lines (101 loc) · 3.58 KB
/
CrawlerMain.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# -*- coding: utf-8 -*-
import threading
import time
import thread
from ApplicationShow import ApplicationShow
from Dispatch import Dispatch
from UrlManager import UrlManager
from HtmlParser import HtmlParser
# Main driver for the crawler.
"""
Choose the seed URL to crawl and add it to urlManager.
while urlManager still has pending URLs {
    take one pending URL out of urlManager
    hand it to dispatch, which fetches it and yields the URL and its content
    pass the content together with the crawled URL into ApplicationShow
    feed the content into the Parser to extract new URLs
    add those new URLs back into urlManager
}
Finally display the crawled results from ApplicationShow
"""
class CrawlerMain(object):
    """Crawler driver.

    Pulls pending URLs from UrlManager, fetches each page with Dispatch on a
    worker thread, parses it with HtmlParser, records (url, title, summary)
    in ApplicationShow, and feeds newly discovered URLs back into UrlManager.
    """
    def __init__(self):
        # HTTP status code treated as a successful fetch.
        self.G_STATE_OK = 200
        # Crawl budget. NOTE(review): with the default -1 the very first loop
        # iteration sees crawCountNum (0) > crawMaxNum (-1) and stops, so
        # callers must call set_craw_max_count() before start().
        self.crawMaxNum = -1
        # Number of crawl tasks dispatched so far.
        self.crawCountNum = 0
        self.urlManager = UrlManager()
        self.dispatch = Dispatch()
        self.htmlParser = HtmlParser("http://baike.baidu.com")
        self.applicationShow = ApplicationShow()
    def __crawl(self, url):
        """Fetch one URL, parse it, store the result, enqueue new links.

        Runs on a worker thread; any failure is logged and swallowed so one
        bad page cannot kill the whole crawl.
        """
        try:
            self.dispatch.launch_request(url)
            if self.dispatch.get_status() != self.G_STATE_OK:
                return
            context = self.dispatch.get_content()
            self.htmlParser.set_content(context)
            self.htmlParser.parser()
            summary = self.htmlParser.get_summary()
            # BUG FIX: the original assigned the bound method object
            # (self.htmlParser.get_title) instead of calling it, so the
            # stored "title" was a method, not a string.
            title = self.htmlParser.get_title()
            urls = self.htmlParser.get_new_urls()
        except Exception as e:  # py2/py3-compatible (was "except Exception, e")
            print("Error " + url + " " + str(e))
            return
        self.applicationShow.add(url, title, summary)
        self.urlManager.add_url(urls)
    def start(self, start_url):
        """Crawl breadth-first from start_url until the budget is exhausted.

        Each round spawns one thread per currently-pending URL, waits for all
        of them, then repeats with the URLs those threads discovered.
        """
        self.urlManager.add_url([start_url])
        while True:
            threads = []
            finish = False
            # NOTE(review): worker threads call urlManager.add_url() while
            # this loop iterates get_urlSet(); safety depends on UrlManager's
            # internals — confirm it hands back a snapshot.
            for url in self.urlManager.get_urlSet():
                if self.crawCountNum > self.crawMaxNum:
                    finish = True
                    break
                else:
                    # Renamed from "thread" to avoid shadowing the imported
                    # py2 "thread" module.
                    worker = threading.Thread(target=self.__crawl, args=(url,))
                    threads.append(worker)
                    self.crawCountNum += 1
            self.__start_thread_pool(threads)
            # Poll until every worker in this round has finished.
            while not self.__is_task_done(threads):
                time.sleep(1)
            print(self.crawCountNum)
            if finish:
                return
    def __start_thread_pool(self, threads):
        """Start every thread in the given list."""
        for worker in threads:
            worker.start()
    def __is_task_done(self, threads):
        """Return True when every thread in `threads` has terminated."""
        finishThreadNum = 0
        for worker in threads:
            if not worker.is_alive():
                finishThreadNum += 1
        print("finish " + str(finishThreadNum))
        return finishThreadNum == len(threads)
    def show(self):
        """Display everything collected so far via ApplicationShow."""
        self.applicationShow.show()
    def set_craw_max_count(self, count):
        """Set the maximum number of pages to crawl (call before start())."""
        self.crawMaxNum = count
if __name__ == "__main__":
    # Entry point: crawl starting from one Baidu Baike article, time the
    # run, append "<max_count>\t<elapsed>" to time.txt, then show results.
    main_url = "http://baike.baidu.com/view/1395656.htm"
    crawlerMain = CrawlerMain()
    max_count = 20000
    crawlerMain.set_craw_max_count(max_count)
    startTime = time.time()
    crawlerMain.start(main_url)
    endTime = time.time()
    # Context manager guarantees the file is closed even if write() raises
    # (original opened/closed by hand with no protection).
    with open('time.txt', 'a') as timeFile:
        timeFile.write(str(max_count) + "\t" + str(endTime - startTime) + '\n')
    crawlerMain.show()