class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.parser = HtmlParser()
        self.downloader = HtmlDownloader()
        self.output = DataOutput()

    def crawl(self, root_url):
        """
        Main crawl loop.
        :param root_url: entry URL
        :return: None
        """
        self.manager.add_new_url(root_url)
        # Keep going while unvisited URLs remain, capped at 20 pages.
        while self.manager.has_new_url() and self.manager.old_url_size() < 20:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.output_txt(data)
                print(data)
                print("Crawled {} links".format(self.manager.old_url_size()))
            except Exception as e:
                print("Crawl failed:", e)
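# The scheduler above delegates to four collaborators that are not shown in
# this section. A minimal sketch of the interfaces it relies on could look
# like the following; the set-based URL bookkeeping, the requests and
# BeautifulSoup choices, and the output.txt filename are illustrative
# assumptions, not the original implementations.
import requests
from urllib.parse import urljoin

from bs4 import BeautifulSoup


class UrlManager(object):
    """Tracks URLs waiting to be crawled and URLs already crawled."""

    def __init__(self):
        self.new_urls = set()  # assumption: unvisited URLs
        self.old_urls = set()  # assumption: visited URLs

    def has_new_url(self):
        return len(self.new_urls) > 0

    def add_new_url(self, url):
        # Skip URLs that are already queued or already visited.
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls:
            self.add_new_url(url)

    def get_new_url(self):
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def old_url_size(self):
        return len(self.old_urls)


class HtmlDownloader(object):
    """Fetches a page and returns its HTML text, or None on failure."""

    def download(self, url):
        resp = requests.get(url, timeout=10)
        if resp.status_code == 200:
            resp.encoding = resp.apparent_encoding
            return resp.text
        return None


class HtmlParser(object):
    """Extracts follow-up links and a data record from a downloaded page."""

    def parser(self, page_url, html):
        soup = BeautifulSoup(html, "html.parser")
        # Resolve relative hrefs against the page URL.
        new_urls = {urljoin(page_url, a["href"])
                    for a in soup.find_all("a", href=True)}
        title = ""
        if soup.title and soup.title.string:
            title = soup.title.string.strip()
        data = {"url": page_url, "title": title}
        return new_urls, data


class DataOutput(object):
    """Appends each record to a plain-text file (filename is an assumption)."""

    def output_txt(self, data):
        with open("output.txt", "a", encoding="utf-8") as f:
            f.write("{}\t{}\n".format(data["title"], data["url"]))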
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        # Keep going while unvisited URLs remain, capped at 100 pages.
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # Fetch the next unvisited URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page
                html = self.downloader.download(new_url)
                # Parse the page into follow-up URLs and a data record
                new_urls, data = self.parser.parser(new_url, html)
                # Hand the extracted URLs back to the URL manager
                self.manager.add_new_urls(new_urls)
                # Buffer the record for later output
                self.output.store_data(data)
                print("Crawled %s links so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed", e)
        # Write everything collected during the crawl to the HTML report
        self.output.out_put_html()
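# This second version buffers records and emits an HTML report at the end, so
# its DataOutput differs from the text-file variant above. A minimal sketch of
# what store_data()/out_put_html() might look like follows; the in-memory
# buffering, the output.html filename, and the table layout are assumptions,
# not the original implementation.
class DataOutput(object):
    """Collects records in memory and dumps them as a simple HTML table."""

    def __init__(self):
        self.datas = []

    def store_data(self, data):
        if data is not None:
            self.datas.append(data)

    def out_put_html(self):
        with open("output.html", "w", encoding="utf-8") as f:
            f.write("<html><body><table>\n")
            for data in self.datas:
                f.write("<tr><td>{}</td><td>{}</td></tr>\n".format(
                    data["title"], data["url"]))
            f.write("</table></body></html>\n")


# A hypothetical entry point; the root URL is a placeholder, not taken from
# the original text.
if __name__ == "__main__":
    spider = SpiderMan()
    spider.crawl("https://example.com/")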