import logging
import requests
from queue import Queue

# ThreadPool is this project's own worker-pool class, defined elsewhere.


class Crawler(object):
    """Main part: crawls the site."""

    def __init__(self, args):
        # Maximum crawl depth
        self.max_deepth = args['deepth']
        # Current depth
        self.current_deepth = 1
        # Thread pool managing the worker threads
        self.threadPool = ThreadPool(args['threads'])
        # Database file used to store results
        self.dbfile = args['dbfile']
        # Keyword to search for
        self.keyword = args['keyword']
        # Whether to run the self-test
        self.testself = args['testself']
        # Links to visit at the current depth; a set deduplicates them
        self.unvisitedUrl = set()
        self.unvisitedUrl.add(args['url'])
        # Links already visited
        self.visitedUrl = set()
        self.q = Queue()
        # HTTP headers
        self.header = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip,deflate,sdch',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36'
        }
        # Connect to the database
        self.connDB()
        self.isRunning = True

    def start(self):
        self.threadPool.startThreads()
        # Crawl level by level until the maximum depth is reached
        while self.current_deepth <= self.max_deepth:
            self.taskQueue()
            while not self.q.empty():
                url = self.q.get()
                # Hand the URL to the thread pool as a job
                self.threadPool.addJob(self.getLinks, url)
            self.threadPool.workJoin()  # wait for all threads to finish
            self.current_deepth += 1
        # Crawl finished
        self.isRunning = False
        self.closeDB()

    def fetchPage(self, url, retry=3):
        '''Fetch the page content.'''
        try:
            # Use a local response object; sharing self.r across threads is unsafe
            r = requests.get(url, headers=self.header, timeout=3)
            if r.status_code == requests.codes.ok:
                source = r.text
                self.writeDB(url, source)
                return source
        except Exception as e:
            if retry > 0:
                return self.fetchPage(url, retry - 1)
            else:
                logging.error('Failed to open %s after 3 retries: %s' % (url, e))
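
For reference, a minimal sketch of how the class might be driven. It assumes the args dict carries exactly the keys that Crawler.__init__ reads; the concrete values (seed URL, database filename, keyword) are hypothetical examples, not part of the original code.

# A minimal usage sketch, assuming args mirrors the keys read in __init__;
# all values below are hypothetical placeholders.
if __name__ == '__main__':
    args = {
        'url': 'http://example.com',  # seed URL (hypothetical)
        'deepth': 2,                  # maximum crawl depth
        'threads': 5,                 # worker thread count
        'dbfile': 'spider.db',        # database file (hypothetical name)
        'keyword': 'python',          # keyword to search for (hypothetical)
        'testself': False,            # skip the self-test
    }
    crawler = Crawler(args)
    crawler.start()

Note that with retry=3, fetchPage makes up to four attempts in total (one initial request plus three recursive retries) before logging the failure.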