import csv

# spider() is a method of the crawler class; self.manage is its UrlManager instance.
def spider(self, url, param):
    page_num = HtmlParser.get_page_num(url)
    print('page_num:', page_num)
    # Write the CSV header once before crawling.
    with open('./name.csv', 'a') as csvfile:
        fieldnames = ['title', 'url', 'down']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
    # Collect the detail-page URLs from every listing page.
    for i in range(1, page_num + 1):
        page_url = url + param + str(i)
        print(page_url)
        new_urls = HtmlParser.get_page_urls(page_url)
        self.manage.add_new_urls(new_urls)
    # Fetch and save each collected URL until the queue is empty.
    while self.manage.has_new_url():
        try:
            new_url = self.manage.get_new_url()
            data = HtmlParser.get_data(new_url)
            DataOutput.write_data(data)
            print(data)
        except Exception as e:
            print('Fetch failed! error:', e)
    print('Crawled {} records so far'.format(self.manage.old_urls_size()))
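The spider method above depends on three collaborators defined elsewhere: UrlManager (the URL queue behind self.manage), HtmlParser (page parsing) and DataOutput (CSV output). As a reference, here is a minimal sketch of the URL-manager interface those calls imply, assuming a simple set-based queue; the project's real UrlManager may be implemented differently.

# Minimal sketch of the UrlManager interface implied by the calls above
# (set-based queue). This is an assumption, not the project's actual code.
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already crawled

    def add_new_urls(self, urls):
        # Queue only URLs that have not been crawled yet.
        for url in urls:
            if url not in self.old_urls:
                self.new_urls.add(url)

    def has_new_url(self):
        return len(self.new_urls) > 0

    def get_new_url(self):
        # Move one URL from the pending set to the crawled set.
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def old_urls_size(self):
        return len(self.old_urls)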
import time

class Spider(object):
    def __init__(self):
        self.manage = UrlManager()
        self.output = DataOutput()
        self.parse = HtmlParser()

    def crawl(self):
        print(self.parse.page_num)
        # Collect the detail-page URLs from every listing page.
        for i in range(1, self.parse.page_num + 1):
            new_urls = self.parse.get_page_urls(i)
            print(new_urls)
            self.manage.add_new_urls(new_urls)
        # Fetch each collected URL and store the parsed data in MongoDB.
        while self.manage.has_new_url():
            new_url = ''
            try:
                new_url = self.manage.get_new_url()
                print(new_url)
                data = self.parse.get_data(new_url)
                print(data)
                self.output.save_mongo(data)
                time.sleep(1)  # throttle requests to be polite to the target site
            except Exception as e:
                print('Fetch failed:', new_url, e)
        print('Crawled {} records so far'.format(self.output.data_size()))
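For completeness, a hypothetical entry point showing how the class could be driven; it assumes UrlManager, HtmlParser and DataOutput are importable from this project and is not part of the original source.

# Hypothetical entry point (assumption, not in the original code).
if __name__ == '__main__':
    spider = Spider()
    spider.crawl()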