def fix_mode(self):
    """
    Fix mode: download the chapters that are still missing, then merge the novel.

    The catalog page is parsed for every chapter link, and
    ``CommonTool.get_not_downloaded_chapters`` decides which of them still
    have to be fetched (per the original notes, by comparing against the
    contents of the temp folder). Those chapters are queued on the thread
    pool; once the pool drains, completeness is checked against the full
    catalog and the chapters are merged into ``self.output_name``.

    No automatic retry happens here: when chapters are still missing, an
    error hint is logged and the method returns.

    :return: None
    """
    started_at = time.time()
    self.logger.info("------------------Fix Mode------------------")

    # Collect every chapter detail link, then narrow down to the missing ones.
    detail_urls = self._parse_catalog()
    pending_urls = CommonTool.get_not_downloaded_chapters(detail_urls)
    self.logger.debug("redownload: " + str(pending_urls))

    pending_total = len(pending_urls)
    self.logger.info("Get novel chapters: " + str(pending_total))
    # In fix mode the chapter total counts only the chapters to re-download.
    self.all_chapter_num = pending_total

    # The thread pool caps how many downloads run concurrently.
    for job in threadpool.makeRequests(self._get_detail, pending_urls):
        self.pool.putRequest(job)

    # Block until every queued chapter has been processed.
    self.pool.wait()

    self.logger.info("Checking download completeness...")
    # Completeness is verified against the whole catalog, not just the
    # subset that was re-downloaded in this run.
    if not CommonTool.check_completion(detail_urls):
        self.logger.error("Some chapters download failed.")
        self.logger.error("Try: python novel_download.py -url URL -t THREAD_LIMIT --fix=true")
    else:
        # Merge all chapters into the final output.
        self.logger.info("All chapters are downloaded successfully. Start merging ...")
        CommonTool.merge_all_chapters(self.output_name)
        self.logger.info("Merged. Enjoy reading!")

    elapsed = time.time() - started_at
    self.logger.info("Total cost %.2fs" % elapsed)
def start(self):
    """
    Run a full download: parse the catalog, fetch every chapter, merge.

    Temporary files are cleared first, then every chapter link from the
    catalog page is queued on the thread pool (per the original notes,
    chapters are staged in the temp folder). While fewer chapters have been
    counted in ``self.progress_cnt`` than ``self.all_chapter_num``, the
    contents of ``self.failed_set`` are re-queued, up to three times.
    Finally completeness is checked; on success the chapters are merged into
    ``self.output_name``, otherwise an error hint is logged and the method
    returns. Use the ``--fix=true`` argument to enter fix mode afterwards.

    :return: None
    """
    started_at = time.time()

    # Start from a clean slate: remove leftover temporary files.
    CommonTool.clean_temp()

    # Collect every chapter detail link from the catalog page.
    detail_urls = self._parse_catalog()
    chapter_total = len(detail_urls)
    self.logger.info("Get novel chapters: " + str(chapter_total))
    self.all_chapter_num = chapter_total

    # The thread pool caps how many downloads run concurrently.
    for job in threadpool.makeRequests(self._get_detail, detail_urls):
        self.pool.putRequest(job)

    # Block until every queued chapter has been processed.
    self.pool.wait()

    # Retry the failed chapters up to three times; if the download is still
    # incomplete afterwards, the user is pointed at fix mode below.
    max_attempts = 3
    attempts = 0
    while self.progress_cnt < self.all_chapter_num and attempts < max_attempts:
        attempts += 1
        self.logger.info("Retry failed set. Len: " + str(len(self.failed_set)))

        # Take the current failures and install a fresh set so that this
        # round's failures are collected separately.
        to_retry = self.failed_set
        self.failed_set = set()

        for job in threadpool.makeRequests(self._get_detail, to_retry):
            self.pool.putRequest(job)

        # Block until this retry round has been processed.
        self.pool.wait()

    self.logger.info("Checking download completeness...")
    if not CommonTool.check_completion(detail_urls):
        self.logger.error("Some chapters download failed.")
        # NOTE(review): the original docstring says `--fix=true` while this
        # hint says `--fix` - confirm which form the CLI parser accepts.
        self.logger.error("Try python novel_download.py -url URL --fix")
    else:
        # Merge all chapters into the final output.
        self.logger.info("All chapters are downloaded successfully. Start merging ...")
        CommonTool.merge_all_chapters(self.output_name)
        self.logger.info("Merged. Enjoy reading!")

    elapsed = time.time() - started_at
    self.logger.info("Total cost %.2fs" % elapsed)