def url_manager_proc(self, url_que, conn_que, root_url):
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:
        while url_manager.has_new_url():
            new_url = url_manager.get_new_url()
            # Send the new URL to the worker nodes.
            url_que.put(new_url)
            print('old_url=', url_manager.old_urls_size())
            # Stop once more than 2000 links have been crawled.
            if url_manager.old_urls_size() > 2000:
                # Notify the worker nodes that the job is finished.
                url_que.put('end')
                print('Control node sent the end notification')
                # Shut down the manager node and persist the URL sets.
                url_manager.save_progress('new_urls.txt', url_manager.new_urls)
                url_manager.save_progress('old_urls.txt', url_manager.old_urls)
                return
        # Add the URLs received from result_solve_proc to the URL manager.
        try:
            if not conn_que.empty():
                urls = conn_que.get()
                for url in urls:
                    url_manager.add_new_url(url)
        except Exception:
            time.sleep(0.1)  # requires 'import time' at module level
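For context, a minimal sketch of how the control node might wire up the two queues and launch url_manager_proc in its own process. The NodeManager wrapper class, the multiprocessing setup, and the placeholder root URL are illustrative assumptions, not taken from the code above.

from multiprocessing import Process, Queue

url_que = Queue()   # control node -> worker nodes (new URLs to crawl)
conn_que = Queue()  # result_solve_proc -> URL manager (discovered URLs)

node = NodeManager()  # hypothetical class that owns url_manager_proc
p = Process(target=node.url_manager_proc,
            args=(url_que, conn_que, 'http://example.com'))  # placeholder URL
p.start()
p.join()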
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Seed the URL manager with the entry URL.
        self.manager.add_new_url(root_url)
        # Stop once 100 links have been crawled.
        while self.manager.has_new_url() and self.manager.old_urls_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parse(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('Crawled %s links so far' % self.manager.old_urls_size())
            except Exception as e:
                print('crawl failed:', e)
        self.output.output_html()
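For completeness, a small usage sketch of the single-machine spider; the entry URL below is a placeholder, not the one used in the original project.

# Usage sketch; 'http://example.com' is a placeholder root URL.
if __name__ == '__main__':
    spider = SpiderMan()
    spider.crawl('http://example.com')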