# -*- coding: utf-8 -*-
import time
import logging
import threading
from collections import deque
from Queue import Empty

# ThreadPool, MyLogger and Db come from the project's own helper modules.


class Crawler(object):

    def __init__(self, myconfig):
        # Thread pool
        self.thread_pool = ThreadPool(myconfig.threadnum)
        # Set of URLs already visited
        self.visited_urls = set()
        # set is not thread-safe, so guard it with a lock
        self.visited_urls_lock = threading.Lock()
        # Queue of URLs not yet visited
        self.will_visited_urls = deque()
        self.will_visited_urls.append(myconfig.url)
        self.temp_q = deque()
        self.cur_depth = 0
        self.status = ""
        self.myconfig = myconfig
        MyLogger(myconfig.logfile, myconfig.loglevel)
        #MyLogger(myconfig.logfile, loglevel = 5)  # debug
        self.db = Db()

    def start(self):
        self.status = "start"
        while self.cur_depth < self.myconfig.depth:
            if self.status == "stop":
                break
            try:
                while self.will_visited_urls:
                    url = self.will_visited_urls.popleft()
                    # Queue the job. This rarely blocks because the main thread
                    # only enqueues work; the actual fetching happens in the
                    # worker threads.
                    self.thread_pool.add_job(self.handler, url)
                # TODO:
                # Notify the workers that there is work to do. Note that the
                # notification is sent only after every URL in will_visited_urls
                # has been queued, which is rather coarse-grained. To save time,
                # the workers could be notified as soon as the number of queued
                # URLs reaches the initial thread count; if there are fewer URLs
                # than threads, notify after all of them have been queued.
                #print ">>>>>>>> give event to threads in thread pool"
                # Tell the worker threads to start a new round of crawling.
                self.thread_pool.event_do_job()
                # Yield the scheduler so the worker threads get time to run.
                time.sleep(3)
            except Empty:
                # No URLs left to visit.
                logging.info("no url right now")
            finally:
                # The crawl at this depth is finished only after every worker in
                # the pool is done. Poll the pool: while any worker is still
                # busy, sleep and check again; stop once the number of waiting
                # threads equals the pool size.
                while True:
                    #print "thread waiting num is %d, config thread num is %d" % (self.thread_pool.get_thread_waiting_num(), self.myconfig.threadnum)
                    if self.thread_pool.get_thread_waiting_num() == self.myconfig.threadnum:
                        # All threads are waiting, so every job has finished.
                        break
                    else:
                        # Some threads are still running, so this depth is not
                        # finished yet; sleep and poll again.
                        time.sleep(10)
                # This depth is done; go one level deeper.
                self.cur_depth += 1
                logging.info("crawler depth now is %s" % str(self.cur_depth))
                if self.cur_depth > self.myconfig.depth:
                    break
                # Pages discovered at this depth were collected in temp_q.
                # Hand temp_q over to will_visited_urls and continue.
                self.will_visited_urls = self.temp_q
                self.temp_q = deque()
        # Every depth has been crawled, or the crawler was asked to exit.
        self.thread_pool.stop_threads()
        logging.info("crawler exit")
        return

    def handler(self, url):
        content = self.get_html_content(url)
        if content == "" or content is None:
            # Could not fetch the content; give up on this URL.
            return
        # Mark this URL as visited.
        self.add_url_to_visited(url)
        if content.find(self.myconfig.key) != -1:
            self.db.save_data(url, self.myconfig.key, content)
        try:
            hrefs = self.get_hrefs(content, url)
        except StandardError as se:
            logging.error("error: %s" % se)
            print se
            # Could not extract hrefs.
            return
        # If any hrefs were found, put them into temp_q so they are visited
        # once the current depth has been fully crawled.
        if hrefs:
            for link in hrefs:
                # Final check: skip links that were already visited or are
                # already queued.
                if not self.is_url_visited(link) \
                        and link not in self.will_visited_urls \
                        and link not in self.temp_q:
                    #print "put %s into temp_q" % link
                    self.temp_q.append(link)
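# A minimal usage sketch (an assumption, not taken from the original source):
# ExampleConfig is hypothetical and only needs to expose the attributes the
# constructor reads (url, threadnum, depth, key, logfile, loglevel).
class ExampleConfig(object):
    url = "http://example.com"   # seed URL (hypothetical value)
    threadnum = 4                # number of worker threads in the pool
    depth = 2                    # maximum crawl depth
    key = "python"               # keyword to search for in fetched pages
    logfile = "spider.log"       # log file handed to MyLogger
    loglevel = 3                 # log verbosity handed to MyLogger


if __name__ == "__main__":
    crawler = Crawler(ExampleConfig())
    crawler.start()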
# -*- coding: utf-8 -*-
import os
import io
import re
import json
import time
import hashlib
from collections import deque
from urlparse import urlparse, urljoin

from bs4 import BeautifulSoup

# ThreadPool, URLFetcher, ResourceDetail, ResourceDetailCollection, mylock
# and log come from the project's own helper modules.


class Crawler(object):

    def __init__(self, args):
        self.thread_num = args.thread_num
        self.output = args.output
        if not os.path.exists(self.output):
            os.mkdir(self.output)
        self.domain_pattern = re.compile(
            r"^([0-9a-zA-Z][0-9a-zA-Z-]{0,62}\.)+([0-9a-zA-Z][0-9a-zA-Z-]{0,62})\.?$")

    def _init(self):
        # Thread pool with the configured number of threads
        self.thread_pool = ThreadPool(self.thread_num)
        self.depth = 2
        # The crawl depth starts at 1
        self.current_depth = 1
        # Links already visited
        self.visited_hrefs = set()
        # Links waiting to be visited
        self.unvisited_hrefs = deque()
        # Whether the crawler is currently running
        self.is_crawling = False
        self.resource_details = ResourceDetailCollection()

    def _format_url(self, raw_value):
        raw_value_str = raw_value.strip().strip('\n')
        if len(raw_value_str) <= 0:
            return ''
        if not self.domain_pattern.match(raw_value_str):
            return ''
        if not raw_value_str.startswith('http'):
            value = 'http://' + raw_value_str
        else:
            value = raw_value_str
        return value

    def crawl(self, url):
        self._init()
        formatted_url = self._format_url(url)
        self.resource_details.set_main_frame_url(formatted_url)
        self.unvisited_hrefs.append(formatted_url)
        print '\nStart Crawling url %s\n' % formatted_url
        self.is_crawling = True
        self.thread_pool.start_threads()
        while self.current_depth < self.depth + 1:
            # Dispatch tasks: the thread pool downloads every page at the
            # current depth concurrently (this call does not block).
            self._assign_current_depth_tasks()
            # Wait until the pool has finished every task; once it has, one
            # crawl depth is complete. self.thread_pool.task_join() would do
            # the same thing, but then Ctrl-C could not interrupt the crawl.
            while self.thread_pool.get_task_left():
                time.sleep(8)
            print 'Depth %d Finish. Totally visited %d links. \n' % (
                self.current_depth, len(self.visited_hrefs))
            log.info('Depth %d Finish. Total visited Links: %d\n' % (
                self.current_depth, len(self.visited_hrefs)))
            self.current_depth += 1
        # After finishing all the tasks, stop this crawl.
        print "All tasks have finished"
        self._on_all_tasks_finished()
        self.stop()

    def stop(self):
        self.is_crawling = False
        self.thread_pool.stop_threads()

    def get_already_visited_num(self):
        # visited_hrefs holds links that have already been handed to the task
        # queue, some of which may still be in flight. The real number of
        # visited links is therefore len(visited_hrefs) minus the number of
        # tasks still pending.
        return len(self.visited_hrefs) - self.thread_pool.get_task_left()

    def _on_all_tasks_finished(self):
        resource_detail_data = unicode(json.dumps(
            self.resource_details.to_json_data(), indent=4))
        hashed_file_name = hashlib.new(
            "md5", self.resource_details.main_frame_url).hexdigest() + ".json"
        resource_detail_data_path = os.path.join(self.output, hashed_file_name)
        with io.open(resource_detail_data_path, 'w') as output_file:
            output_file.write(unicode(resource_detail_data))

    def _assign_current_depth_tasks(self):
        mylock.acquire()
        copied_unvisited_hrefs = deque()
        while self.unvisited_hrefs:
            copied_unvisited_hrefs.append(self.unvisited_hrefs.popleft())
        mylock.release()
        while copied_unvisited_hrefs:
            url = copied_unvisited_hrefs.popleft()
            # Mark the link as visited (or about to be visited) so the same
            # link is never fetched twice.
            self.visited_hrefs.add(url)
            # Hand the task to the task queue.
            self.thread_pool.put_task(self._task_handler, url)

    def _task_handler(self, url):
        # Fetch the page source first, then save it. Both operations block
        # heavily, which is why they run inside worker threads.
        url_fetcher = URLFetcher(url)
        retry = 1
        if url_fetcher.fetch(retry):
            self._save_task_results(url, url_fetcher)
            self._add_unvisited_hrefs(url_fetcher)

    def _save_task_results(self, url, url_fetcher):
        print 'Visited URL : %s \n' % url
        response_headers = url_fetcher.get_response_headers()
        response_detail = ResourceDetail(url, url_fetcher.request_time,
                                         url_fetcher.response_time,
                                         response_headers)
        mylock.acquire()
        self.resource_details.add_detail(response_detail)
        mylock.release()

    def _add_unvisited_hrefs(self, url_fetcher):
        '''Add unvisited links: put every valid URL into unvisited_hrefs.'''
        # Filter the links: 1. keep only http/https URLs; 2. make sure each
        # link is visited only once.
        url, page_source = url_fetcher.get_data()
        hrefs = self.get_all_resource_hrefs(url, page_source)
        mylock.acquire()
        for href in hrefs:
            if self._is_http_or_https_protocol(href):
                if not self._is_href_repeated(href):
                    self.unvisited_hrefs.append(href)
        mylock.release()

    def get_all_resource_hrefs(self, url, page_source):
        '''Parse the HTML source, collect every link on the page and return them as a list.'''
        hrefs = []
        soup = BeautifulSoup(page_source)
        results = soup.find_all(True)
        for tag in results:
            href = None
            # Skip anchor tags; only resource links are collected.
            if tag.name == 'a':
                continue
            # The link must be encoded as UTF-8: links with Chinese file names
            # such as http://aa.com/文件.pdf are not URL-encoded automatically
            # by bs4, which would otherwise raise an encoding exception.
            if tag.has_attr('href'):
                href = tag.get('href').encode('utf8')
            elif tag.has_attr('src'):
                href = tag.get('src').encode('utf8')
            if href is not None:
                if not href.startswith('http'):
                    # Resolve relative links against the page URL.
                    href = urljoin(url, href)
                hrefs.append(href)
        return hrefs

    def _is_http_or_https_protocol(self, href):
        protocol = urlparse(href).scheme
        if protocol == 'http' or protocol == 'https':
            return True
        return False

    def _is_href_repeated(self, href):
        if href in self.visited_hrefs or href in self.unvisited_hrefs:
            return True
        return False
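# A minimal usage sketch (an assumption, not taken from the original source):
# the args namespace mirrors what argparse would provide; thread_num and
# output are the only attributes the constructor reads, and the seed value
# passed to crawl() is a bare domain because _format_url expects one.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="resource crawler")
    parser.add_argument("--thread_num", type=int, default=5)
    parser.add_argument("--output", default="output")
    parser.add_argument("--url", default="example.com")  # hypothetical seed
    args = parser.parse_args()

    crawler = Crawler(args)
    crawler.crawl(args.url)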