async def fetch_single_page_and_save_mongo(self, session, url):
    """Fetch one novel-chapter page and persist the parsed result to MongoDB.

    A non-200 response or any raised exception pushes the URL back onto the
    redis wait set for a retry; on success the URL's hash is recorded in the
    finish set.

    :param session: aiohttp client session used to issue the GET request
    :param url: chapter URL to download
    :return: None
    """
    try:
        async with session.get(url, timeout=15) as resp:
            status_code = resp.status
            # Anything other than 200 is re-queued for a later attempt.
            if status_code != 200:
                self.db.add_to_wait(url)
                return
            # Site serves GBK; ignore undecodable bytes rather than fail.
            page_text = await resp.text("gbk", "ignore")
            parsed = Parser.parse_single_page(url, page_text)
            # The trailing "..._<id>" path segment names the target collection.
            book_id = url.split("/")[-2].split("_")[-1]
            await self.mongodb.save_data(parsed, book_id)
            self.db.add_to_finish(self.db.hash_url(url))
            crawler.info(f"get url: {url} status: {status_code}")
    except Exception:
        # Network/decode/parse failure: log the traceback and retry later.
        crawler.error(traceback.format_exc())
        self.db.add_to_wait(url)
async def fetch_single_page_and_save_direct(self, session, url):
    """Fetch one novel-chapter page and store it directly (zlib-compressed).

    Fix: ``session.get`` was previously outside the ``try`` block, so a
    connection error or timeout raised by the request itself escaped the
    handler and the URL was never re-added to the wait set — the chapter was
    silently lost. The ``try`` now wraps the request as well, matching
    ``fetch_single_page_and_save_mongo``.

    :param session: aiohttp client session used to issue the GET request
    :param url: chapter URL to download
    :return: None
    """
    try:
        async with session.get(url, timeout=15) as resp:
            status_code = resp.status
            # Non-200: push back onto the wait set for a retry.
            if status_code != 200:
                self.db.add_to_wait(url)
                return
            text = await resp.read()
            # Extract chapter name/content (xpath-based parsing).
            save_dict = Parser.parse_single_page(url, text)
            # zlib-compress and store directly; the book id is the trailing
            # "..._<id>" segment of the URL path.
            await DirectStorage.save_single_page(url.split("/")[-2].split("_")[-1],
                                                 save_dict["chapter_name"],
                                                 save_dict["content"])
            # Record completion: hex MD5 of the URL into the redis finish set.
            self.db.add_to_finish(self.db.hash_url(url))
            crawler.info(f"get url: {url} status: {status_code}")
    except Exception:
        # Request/parse/storage failure: log the traceback and retry later.
        crawler.error(traceback.format_exc())
        self.db.add_to_wait(url)