class SpiderMan(object):
    """Crawler driver: iterates a configured page-index range, downloads and
    parses each URL, and stores the results in the database.

    Collaborators (project classes, defined elsewhere):
    UrlManager, HtmlDownloader, HtmlParser, DataOutput, Settings, pyprind.
    """

    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()  # connects to the database on instantiation
        # presumably a dict with keys "Index" (a [start, stop) pair) and
        # "Page" (URLs per page) — TODO confirm against Settings
        self.s = Settings().setting

    def crawl(self):
        """Crawl every page index in ``s["Index"][0] .. s["Index"][1])`` and
        insert the parsed records into the database.

        Failures on individual URLs are counted but do not abort the crawl
        (best-effort semantics, as in the original code).
        """
        self.output.create_table()  # create the table (no-op if it exists? — verify)
        total_errors = 0
        old_total = self.output.get_total()
        for page_index in range(self.s["Index"][0], self.s["Index"][1]):
            # add_urls returns the number of already-seen (duplicate) URLs
            # on this page, so the progress bar only counts fresh work
            duplicates = self.manager.add_urls(page_index, self.output)
            urls = self.manager.get_urls()
            bar = pyprind.ProgBar(
                self.s["Page"] - duplicates,
                title="Crawling " + "Page " + str(page_index) + " ......",
            )  # progress bar
            for url in urls:
                try:
                    bar.update()
                    html = self.downloader.download(url)
                    data = self.parser.parse(html)
                    self.output.insert_into_db(data)  # insert into the database
                except Exception:
                    # best-effort: skip the failing URL but keep a tally
                    total_errors += 1
                    continue
        new_total = self.output.get_total()
        self.output.close_cursor()  # close the database connection
        print("本次爬取", new_total - old_total, "条")
class SpiderMan(object):
    """Crawler driver: drains the UrlManager's queue of new URLs, downloads
    and parses each one, and stores the results in the database.

    Collaborators (project classes, defined elsewhere):
    UrlManager, HtmlDownloader, HtmlParser, DataOutput, pyprind.
    """

    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()  # connects to the database on instantiation

    def crawl(self):
        """Crawl every queued URL and insert the parsed records into the
        database, reporting the error count from the last insert batch.
        """
        self.output.create_table()  # create the table
        self.manager.add_new_urls()  # populate the URL queue
        total = self.manager.new_urls_size()
        # Fix: size the progress bar by the actual queue length instead of
        # the hard-coded 30 the original used.
        bar = pyprind.ProgBar(total, title="Crawling......")  # progress bar
        # Fix: pre-bind so the summary below cannot raise NameError when the
        # queue starts out empty and the loop body never runs.
        errors, errors_messages = 0, []
        while self.manager.new_urls_size():
            url = self.manager.get_new_url()
            html = self.downloader.download(url)
            data = self.parser.parse(html)
            # NOTE(review): only the values from the final insert are
            # reported below — presumably insert_into_db accumulates
            # internally; verify against DataOutput.
            errors, errors_messages = self.output.insert_into_db(data)
            bar.update()
        self.output.close_cursor()  # close the database connection
        print("本次共爬取", total, "条")
        if errors:
            print("其中", errors, "条数据出错")
            print("错误:" + str(errors_messages))