Example #1
    def fix_mode(self):
        """
        修复模式,检查temp文件夹下内容 与 小说目录页 下载未完成章节并合成小说
        :return:
        """
        start_time = time.time()
        self.logger.info("------------------Fix Mode------------------")
        # Collect all chapter detail links
        detail_urls = self._parse_catalog()
        redownload_urls = CommonTool.get_not_downloaded_chapters(detail_urls)
        self.logger.debug("redownload: " + str(redownload_urls))
        self.logger.info("Get novel chapters: " + str(len(redownload_urls)))
        self.all_chapter_num = len(redownload_urls)
        # Use threadpool to cap the number of concurrent workers
        requests = threadpool.makeRequests(self._get_detail, redownload_urls)
        for req in requests:
            self.pool.putRequest(req)
        # Wait until every chapter fetch has finished
        self.pool.wait()

        self.logger.info("Checking download completeness...")
        if CommonTool.check_completion(detail_urls):
            # Merge the full text
            self.logger.info("All chapters are downloaded successfully. Start merging ...")
            CommonTool.merge_all_chapters(self.output_name)
            self.logger.info("Merged. Enjoy reading!")
        else:
            self.logger.error("Some chapters download failed.")
            self.logger.error("Try: python novel_download.py -url URL -t THREAD_LIMIT --fix=true")
        self.logger.info("Total cost %.2fs" % (time.time() - start_time))
Example #2
 def _get_detail(self, detail_url):
     """
     获取详细章节详细内容 并写入 temp文件夹下 暂存
     :param detail_url:  章节链接
     :return: None
     """
     time.sleep(0.5)
     try:
         # this will raise FetchFailedException
         content = CommonTool.fetch_page(detail_url)
         # this will raise EmptyContentException
         result = self._check_parse_detail(content)
         # Use the last segment of the chapter URL as the temp file name
         filename = detail_url.split('/')[-1]
         # Cache the chapter to a file
         CommonTool.save_chapter(filename, result)
         with self.lock:
             self.progress_cnt += 1
         self._print_progress()
     except FetchFailedException as e:
         self.failed_set.add(detail_url)
         self.logger.debug("Fetch failed: " + detail_url + ". " + str(e))
     except EmptyContentException:
         self.failed_set.add(detail_url)
         self.logger.debug("Empty content: " + detail_url)
Example #3
 def _parse_catalog(self):
     """
     请求self.url,获取小说目录页面内容
     :return: 所有详细页面的链接
     """
     result = CommonTool.fetch_page(self.catalog_url)
     doc = pq.PyQuery(result)
     # Deduplicate in memory
     detail_urls = set()
     # Layout 1: https://www.kanunu8.com/book3/8257/
     for a in doc('table:nth-child(2) > tbody > tr > td > a').items():
         detail_url = urllib.parse.urljoin(self.catalog_url, a.attr.href)
         if detail_url in detail_urls:
             # already seen
             continue
         if self.HOST not in detail_url:
             # skip links that point off-site
             continue
         detail_urls.add(detail_url)
     # Layout 2: https://www.kanunu8.com/book2/10946/index.html
     for a in doc('div.col-left > div > dl > dd > a').items():
         detail_url = urllib.parse.urljoin(self.catalog_url, a.attr.href)
         if detail_url in detail_urls:
             # already seen
             continue
         if self.HOST not in detail_url:
             # skip links that point off-site
             continue
         detail_urls.add(detail_url)
     return detail_urls
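Both selector branches funnel every href through urljoin, so relative catalog entries resolve against the catalog URL. A quick illustration (the chapter filename is made up):

from urllib.parse import urljoin

catalog_url = "https://www.kanunu8.com/book3/8257/"
print(urljoin(catalog_url, "183955.html"))
# -> https://www.kanunu8.com/book3/8257/183955.html
print(urljoin(catalog_url, "/book3/8257/183955.html"))
# -> https://www.kanunu8.com/book3/8257/183955.html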
Example #4
    def start(self):
        """
        解析目录页 尝试爬取所有章节 暂存至temp文件夹
        校验下载完整后,合成小说文件
        若下载不完整,则退出。 使用 --fix=true 参数进入修复模式
        :return:
        """
        start_time = time.time()
        # Clear leftover temp files first
        CommonTool.clean_temp()
        # Collect all chapter detail links
        detail_urls = self._parse_catalog()
        self.logger.info("Get novel chapters: " + str(len(detail_urls)))
        self.all_chapter_num = len(detail_urls)
        # Use threadpool to cap the number of concurrent workers
        requests = threadpool.makeRequests(self._get_detail, detail_urls)
        for req in requests:
            self.pool.putRequest(req)
        # Wait until every chapter fetch has finished
        self.pool.wait()

        retry_max = 3
        retry_cnt = 0
        # Retry up to 3 times; if the download still cannot complete, use --fix mode
        while (self.progress_cnt < self.all_chapter_num) and (retry_cnt < retry_max):
            retry_cnt += 1
            self.logger.info("Retry failed set. Len: " + str(len(self.failed_set)))
            retry, self.failed_set = self.failed_set, set()
            requests = threadpool.makeRequests(self._get_detail, retry)
            for req in requests:
                self.pool.putRequest(req)
            # Wait until every chapter fetch has finished
            self.pool.wait()

        self.logger.info("Checking download completeness...")
        if CommonTool.check_completion(detail_urls):
            # Merge the full text
            self.logger.info("All chapters are downloaded successfully. Start merging ...")
            CommonTool.merge_all_chapters(self.output_name)
            self.logger.info("Merged. Enjoy reading!")
        else:
            self.logger.error("Some chapters download failed.")
            self.logger.error("Try python novel_download.py -url URL --fix")
        self.logger.info("Total cost %.2fs" % (time.time() - start_time))
Example #5
 @staticmethod
 def _parse_detail(content):
     """
     解析页面详细内容,提取并返回 标题+正文
     :param content:  小说内容页面
     :return: 标题+正文
     """
     doc = pq.PyQuery(content)
     title = doc(
         '#wrapper > div.content_read > div > div.bookname > h1').text()
     title = CommonTool.fix_title(title)
     content = doc('#content').text()
     return title, content
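The selectors in this variant can be exercised against a stub page to see what _parse_detail extracts; the markup below is invented to match the selectors:

import pyquery as pq

html = """
<html><body>
<div id="wrapper"><div class="content_read"><div>
  <div class="bookname"><h1>Chapter 1</h1></div>
</div></div></div>
<div id="content">Chapter body text.</div>
</body></html>
"""
doc = pq.PyQuery(html)
print(doc('#wrapper > div.content_read > div > div.bookname > h1').text())
# -> Chapter 1
print(doc('#content').text())
# -> Chapter body text.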
Example #6
 @staticmethod
 def _parse_detail(content):
     """
     解析页面详细内容,提取并返回 标题+正文
     :param content:  小说内容页面
     :return: 标题+正文
     """
     doc = pq.PyQuery(content)
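     # This site prefixes titles with "正文" ("main text"); drop it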
     title = doc('#directs > div.bookInfo > h1 > strong').text().replace(
         "正文", "").strip()
     title = CommonTool.fix_title(title)
     content = doc('#content').text()
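     # Strip leftover inline-JS calls the site embeds in the body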
     content = content.replace('style6();', '').replace('style5();', '')
     return title, content
Example #7
 def _parse_catalog(self):
     """
     请求self.url,获取小说目录页面内容
     :return: 所有详细页面的链接
     """
     result = CommonTool.fetch_page(self.catalog_url)
     doc = pq.PyQuery(result)
     # Deduplicate in memory
     detail_urls = set()
     for a in doc('#list > dl > dd > a').items():
         # Resolve relative hrefs before the membership test, so the
         # comparison happens on the same absolute form that is stored
         detail_url = urllib.parse.urljoin(self.HOST, a.attr.href)
         if detail_url in detail_urls:
             # already seen
             continue
         detail_urls.add(detail_url)
     return detail_urls
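check_completion and merge_all_chapters close the loop in start() and fix_mode(). A sketch consistent with the temp-file convention above; the sort key, encoding, and directory name are assumptions:

import os

def check_completion(detail_urls, temp_dir="temp"):
    # Complete means every catalog URL has a non-empty cached file
    for url in detail_urls:
        path = os.path.join(temp_dir, url.split('/')[-1])
        if not os.path.isfile(path) or os.path.getsize(path) == 0:
            return False
    return True

def merge_all_chapters(output_name, temp_dir="temp"):
    # Chapter files are named after numeric URL segments on these
    # sites, so a numeric sort approximates reading order
    names = sorted(os.listdir(temp_dir),
                   key=lambda n: int(os.path.splitext(n)[0]))
    with open(output_name, 'w', encoding='utf-8') as out:
        for name in names:
            with open(os.path.join(temp_dir, name), encoding='utf-8') as f:
                out.write(f.read() + '\n')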
Example #8
 def _parse_catalog(self):
     """
     请求self.url,获取小说目录页面内容
     :return: 所有详细页面的链接
     """
     result = CommonTool.fetch_page(self.catalog_url)
     doc = pq.PyQuery(result)
     # Deduplicate in memory
     detail_urls = set()
     for a in doc(
             '#chapter > div.chapterSo > div.chapterNum > ul > div.clearfix.dirconone li > a'
     ).items():
         detail_url = a.attr.href
         if detail_url in detail_urls:
             # already seen
             continue
         if self.HOST not in detail_url:
             # skip links that point off-site
             continue
         detail_urls.add(detail_url)
     return detail_urls
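Putting the pieces together, a hypothetical driver matching the CLI hinted at in the log messages; the class and parameter names follow the NovelDownloader sketch above and are assumptions:

downloader = NovelDownloader(
    catalog_url="https://www.kanunu8.com/book3/8257/",
    output_name="novel.txt",
    thread_limit=10,
)
downloader.start()     # full crawl: parse catalog, fetch chapters, merge
# If chapters are still missing after the built-in retries:
downloader.fix_mode()  # re-download only the incomplete chapters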