Example #1
import logging
import requests
from Queue import Queue  # Python 2; on Python 3 use: from queue import Queue
from threadPool import ThreadPool  # assumed module name; ThreadPool is a custom class from the original project

class Crawler(object):
	"""Main part, carwl the site"""
	def __init__(self, args):
		# maximum crawl depth
		self.max_deepth = args['deepth']
		# current crawl depth
		self.current_deepth = 1
		# thread pool that manages the worker threads
		self.threadPool = ThreadPool(args['threads'])
		# database file used to store crawled data
		self.dbfile = args['dbfile']
		# keyword to search for
		self.keyword = args['keyword']
		# whether to run the self test
		self.testself = args['testself']
		# links still to visit at the current depth; a set removes duplicates
		self.unvisitedUrl = set()
		self.unvisitedUrl.add(args['url'])
		# links already visited
		self.visitedUrl = set()
		# task queue for the current depth level
		self.q = Queue()
		# HTTP request headers
		self.header = {
			'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
			'Accept-Encoding': 'gzip,deflate,sdch',
			'Connection': 'keep-alive',
			'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36'
		}
		# connect to the database
		self.connDB()

		# crawl state flag; cleared when start() finishes
		self.isRunning = True

	def start(self):
		self.threadPool.startThreads()
		# crawl level by level until the maximum depth is reached
		while self.current_deepth <= self.max_deepth:
			self.taskQueue()
			while not self.q.empty():
				url = self.q.get()
				# add a fetch job to the thread pool
				self.threadPool.addJob(self.getLinks, url)
			self.threadPool.workJoin()  # wait for all worker threads to finish this level
			self.current_deepth += 1
		# crawling finished
		self.isRunning = False
		self.closeDB()

	def fetchPage(self, url, retry=3):
		'''Fetch the page content and record it in the database.'''
		try:
			# keep the response in a local variable so concurrent worker threads
			# do not overwrite each other's result
			r = requests.get(url, headers=self.header, timeout=3)
			if r.status_code == requests.codes.ok:
				source = r.text
				self.writeDB(url, source)
				return source
		except Exception as e:
			if retry > 0:
				return self.fetchPage(url, retry - 1)
			else:
				logging.error('Open failed after 3 retries: %s (%s)' % (url, e))
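The class above only defines the crawler; a minimal driver sketch follows. The keys of the args dict match the ones read in __init__ (url, deepth, threads, dbfile, keyword, testself); the concrete values and the logging setup are illustrative assumptions, not part of the original example.

# Minimal usage sketch (assumed values; the original example does not show how
# the args dict is built, e.g. it may come from argparse in the full program).
if __name__ == '__main__':
	logging.basicConfig(level=logging.INFO)
	args = {
		'url': 'http://example.com',  # start URL (assumed)
		'deepth': 2,                  # maximum crawl depth
		'threads': 5,                 # worker threads in the pool
		'dbfile': 'spider.db',        # database file for fetched pages (assumed name)
		'keyword': 'python',          # keyword to search for (assumed)
		'testself': False,            # skip the self test
	}
	crawler = Crawler(args)
	crawler.start()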