class SpiderMan(object):
    """Scheduler that wires the URL manager, downloader, parser, and output together."""

    def __init__(self):
        self.manager = URLManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Seed the queue with the entry URL.
        self.manager.add_new_url(root_url)
        # Stop once the queue is empty or 100 pages have been crawled.
        while (self.manager.has_new_url() and
               self.manager.old_url_size() < 100):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                # Parse the downloaded HTML (the original called self.parser
                # directly and discarded html, which cannot work).
                new_urls, data = self.parser.parse(new_url, html)
                # Queue the batch of newly discovered links.
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('Crawled {} links so far'.format(self.manager.old_url_size()))
            except Exception:
                print('Crawl failed')
        self.output.output_html()
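
# The four collaborator classes (URLManager, HtmlDownloader, HtmlParser,
# DataOutput) are referenced above but not shown in this section. What follows
# is a minimal sketch of plausible implementations, assuming requests for
# downloading and BeautifulSoup for parsing; the parse() return format, the
# {'url', 'title'} data payload, and the 'output.html' filename are
# assumptions for illustration, not the original implementation.
import requests
from bs4 import BeautifulSoup


class URLManager(object):
    """Tracks which URLs are waiting to be crawled and which are done."""

    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def has_new_url(self):
        return len(self.new_urls) > 0

    def get_new_url(self):
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def add_new_url(self, url):
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls or []:
            self.add_new_url(url)

    def old_url_size(self):
        return len(self.old_urls)


class HtmlDownloader(object):
    """Fetches a page and returns its HTML text."""

    def download(self, url):
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text


class HtmlParser(object):
    """Extracts outgoing links and page data from downloaded HTML."""

    def parse(self, page_url, html):
        soup = BeautifulSoup(html, 'html.parser')
        # Collect absolute links only; relative-link resolution is omitted.
        new_urls = {a['href'] for a in soup.find_all('a', href=True)
                    if a['href'].startswith('http')}
        # Hypothetical data payload: the page URL and its <title> text.
        title = soup.title.string if soup.title else ''
        return new_urls, {'url': page_url, 'title': title}


class DataOutput(object):
    """Accumulates parsed records and writes them out as a simple HTML table."""

    def __init__(self):
        self.datas = []

    def store_data(self, data):
        if data:
            self.datas.append(data)

    def output_html(self):
        with open('output.html', 'w', encoding='utf-8') as fout:
            fout.write('<html><body><table>')
            for data in self.datas:
                fout.write('<tr><td>{}</td><td>{}</td></tr>'.format(
                    data['url'], data['title']))
            fout.write('</table></body></html>')


# Example entry point (the root URL is a placeholder):
if __name__ == '__main__':
    spider = SpiderMan()
    spider.crawl('https://example.com')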