def spider_chapter(top=10):
    """Fetch chapter listings for unprocessed books and store them.

    Loads up to ``top`` ``Book`` rows whose ``Process`` flag is false, asks
    ``xbqg.get_chapters`` for each book's chapter list, and inserts one
    ``Chapter`` row per entry, numbered from 1 in listing order. A book is
    flagged ``Process = True`` (and the work committed) only when at least
    one chapter was stored; books whose lookup returns ``None`` or nothing
    are left untouched.

    Args:
        top: Maximum number of books to handle in one call.
    """
    # NOTE(review): this label reads "category book extraction", which does
    # not describe this function -- it looks copy-pasted. Left unchanged
    # because it is part of the emitted log text; confirm before renaming.
    _name = '分类书籍提取'
    log.info(f"[{_name}]模块开始处理")
    ebook_sesson = EBookSession()
    try:
        books = ebook_sesson.query(Book).filter_by(Process=False).limit(top).all()
        log.info(f"[{_name}]本次提取到书籍:{len(books)}")
        for book in books:
            log.info(f"[{_name}]处理书籍 {book.Name}")
            chapters = xbqg.get_chapters(book.Name)
            if chapters is None:
                continue
            # Serial numbers are 1-based positions in the fetched listing.
            models = [
                Chapter(book.Id, serial, chapter['name'], chapter['url'])
                for serial, chapter in enumerate(chapters, start=1)
            ]
            if not models:
                continue
            ebook_sesson.add_all(models)
            log.info(f"[{_name}]书籍提取到章节 {len(models)} 条")
            book.Process = True
            # One commit per book so earlier books stay saved if a later
            # one fails.
            ebook_sesson.commit()
    finally:
        # Always release the session, even when a fetch or commit raises.
        ebook_sesson.close()
    log.info(f"[{_name}]模块处理完成...")
def spider_book_info_by_category():
    """Fetch book info for every cached category and store new books.

    For each category returned by ``cacheContext.get_all_category()``, the
    books listed at ``category.Url`` are fetched through ``qdh5.get_books``.
    Books whose name ``cacheContext.exists_book`` already reports are
    skipped; the rest are turned into models with ``generate_book_model``,
    tagged with their category ids, and added to the session. The session is
    committed once per category.
    """
    _name = '分类书籍提取'
    log.info(f"[{_name}]模块开始处理")
    ebook_sesson = EBookSession()
    try:
        categories = cacheContext.get_all_category()
        log.info(f"[{_name}]分类数量:{len(categories)}")
        for category in categories:
            log.info(f"[{_name}]处理分类:{category.Name}")
            # A ParentId of 0 is treated as "this is a top-level category":
            # its own Id becomes the category id and there is no
            # sub-category. Otherwise the parent is the category and this
            # row is the sub-category.
            if category.ParentId == 0:
                category_id, sub_category_id = category.Id, 0
            else:
                category_id, sub_category_id = category.ParentId, category.Id

            books = qdh5.get_books(category.Url)
            for book in books:
                if cacheContext.exists_book(book['name']):
                    continue
                # Not known yet: build and stage a new row.
                model = generate_book_model(**book)
                model.CategoryId = category_id
                model.SubCategoryId = sub_category_id
                log.info(f"[{_name}]提取到书籍:{model}")
                ebook_sesson.add(model)
            # One commit per category.
            ebook_sesson.commit()
    finally:
        # Always release the session, even when a fetch or commit raises.
        ebook_sesson.close()
    log.info(f"[{_name}]模块处理完成...")
def _init_data(self):
    """Load pending books from the database into the book queue.

    Selects ``Book`` rows with ``Process`` false and ``Id`` at or above
    ``self.__book_id``, capped at ``self.__limit`` rows, and puts each one
    on ``self.__book_queue``. The session is closed before the rows are
    queued.
    """
    _fix = "全局"
    self.log.info(f"[{_fix}] 开始提取数据...")
    session = EBookSession()
    try:
        books = (
            session.query(Book)
            # `== False` builds a SQL expression here; it must not be
            # rewritten as `not Book.Process` or `is False`.
            .filter(Book.Process == False)  # noqa: E712
            .filter(Book.Id >= self.__book_id)
            .limit(self.__limit)
            .all()
        )
    finally:
        # Release the session even if the query raises.
        session.close()
    for book in books:
        self.__book_queue.put(book)
    self.log.info(f"[{_fix}] 提取数据完成...")
def download_chapter(top=10):
    """Download the content of pending chapters and mark them as done.

    Loads up to ``top`` ``Chapter`` rows with ``Status == 0``. For each one
    the content is fetched with ``xbqg.get_chapter_content``, written out
    through ``file.write_book``, and the row is then updated with the
    content length (``WordNums``) and ``Status = 1`` and committed. Chapters
    whose content comes back as ``None`` are skipped. A failure on one
    chapter is logged and does not stop the remaining chapters.

    Args:
        top: Maximum number of chapters to handle in one call.
    """
    _name = '下载章节'
    log.info(f"[{_name}]模块开始处理")
    ebook_sesson = EBookSession()
    try:
        chapters = ebook_sesson.query(Chapter).filter_by(Status=0).limit(top).all()
        log.info(f"[{_name}]提取到待下载章节 {len(chapters)}")
        for chapter in chapters:
            try:
                content = xbqg.get_chapter_content(chapter)
                if content is None:
                    continue
                # Write the file before touching the row: if the write
                # fails, the chapter must not be left flagged as downloaded
                # in the session and persisted by a later commit.
                file.write_book(chapter.BookId, chapter.SerialNums, content)
                chapter.WordNums = len(content)
                chapter.Status = 1
                ebook_sesson.commit()
                log.info(f"[{_name}]章节下载完成 {chapter}")
            except Exception:
                # exc_info attaches the traceback; passing the exception as
                # a positional argument would be treated as a %-format
                # argument for a message that has no placeholder.
                log.error(f"[{_name}]{chapter}异常 ", exc_info=True)
                # Discard any half-applied changes so the next chapter
                # starts from a clean, usable session.
                ebook_sesson.rollback()
    finally:
        ebook_sesson.close()
    log.info(f"[{_name}]模块处理完成...")
def download_chapter(self, id):
    """Worker loop that downloads chapters taken from the chapter queue.

    Repeatedly takes a chapter from ``self.__chapter_queue``, fetches its
    content with ``xbqg.get_chapter_content``, writes it out through
    ``file.write_book``, and updates the matching ``Chapter`` row
    (``WordNums`` and ``Status``). A chapter whose content is ``None`` or
    whose processing raises is put back at the end of the queue for another
    attempt.

    When the chapter queue is empty the worker sleeps 10 seconds if the book
    queue still has entries, and exits once both queues are empty.

    Args:
        id ([int]): [worker thread number, used only in log messages]
    """
    from queue import Empty

    fix = f"[下载章节(线程{id})]"
    self.log.info(f"{fix} 开始执行")
    session = EBookSession()
    index = 0
    try:
        while True:
            # Non-blocking get: checking empty() and then calling a blocking
            # get() can hang forever when another worker takes the last
            # item in between.
            try:
                chapter = self.__chapter_queue.get_nowait()
            except Empty:
                if self.__book_queue.empty():
                    # NOTE(review): a book that was already taken off the
                    # book queue but whose chapters are not queued yet is
                    # invisible here, so a worker may exit early -- confirm
                    # whether the caller accounts for that.
                    break
                self.log.warning(f"{fix} 队列为空,等待10秒后继续检测...")
                time.sleep(10)  # no work yet; wait 10 seconds
                continue

            index += 1
            msg = f"{fix}[{index}/{self.__chapter_queue.qsize()}]下载章节 {chapter.Name}"
            try:
                content = xbqg.get_chapter_content(chapter)
                if content is None:
                    self.__chapter_queue.put(chapter)
                    self.log.warning(f"{msg} 下载内容为空,已经放到任务末尾等待重新执行")
                    continue
                chapter.WordNums = len(content)
                chapter.Status = 1
                # Write the content to file.
                file.write_book(chapter.BookId, chapter.SerialNums, content)
                # Persist the new state with an UPDATE keyed on the id.
                session.query(Chapter).filter(Chapter.Id == chapter.Id).update(
                    {
                        "WordNums": chapter.WordNums,
                        "Status": chapter.Status
                    })
                session.commit()
                self.log.info(f"{msg} 完成")
            except Exception:
                self.log.error(f"{msg} 异常", exc_info=True)
                # Re-queue first so the chapter is not lost even if the
                # rollback below fails.
                self.__chapter_queue.put(chapter)
                self.log.warning(f"{msg} 异常,已经放到任务末尾等待重新执行")
                # Reset the session so later updates are not rejected
                # because of this failed transaction.
                session.rollback()
            finally:
                # Every get() needs exactly one task_done(); a retry was
                # re-added with put() above, which counts as a new task.
                self.__chapter_queue.task_done()
    finally:
        session.close()
    self.log.info(f"{fix} 执行完成")
def refresh(self):
    """Reload the class-level caches on ``CacheContext`` from the database.

    Rebuilds three caches: the set of all book names, the list of all
    ``Category`` rows, and the set of category names derived from that list.
    The caches are replaced only after every query has succeeded, so a
    failing query leaves the previous cache contents in place.
    """
    ebook_sesson = EBookSession()
    try:
        book_names = {row.Name for row in ebook_sesson.query(Book.Name).all()}
        categories = ebook_sesson.query(Category).all()
        category_names = {category.Name for category in categories}
    finally:
        # Release the session even if a query raises.
        ebook_sesson.close()

    CacheContext.book_names_cache = book_names
    CacheContext.categories_cache = categories
    CacheContext.category_names_cache = category_names
def spider_book_info_by_task():
    """Fetch book info for every unprocessed book task.

    Loads all ``BookTask`` rows with ``Process`` false. When
    ``cacheContext.exists_book`` does not report the task's name, the book
    is looked up with ``qdh5.get_book`` and stored via
    ``generate_book_model``. A task whose lookup returns ``None`` is left
    unprocessed so it can be picked up again; every other task is flagged
    ``Process = True``.
    """
    _name = '书籍搜索'
    log.info(f"[{_name}]模块开始处理")
    ebook_sesson = EBookSession()
    try:
        tasks = ebook_sesson.query(BookTask).filter_by(Process=False).all()
        log.info(f"[{_name}]待处理数据 {len(tasks)} 条")
        for task in tasks:
            book_name = task.Name
            if not cacheContext.exists_book(book_name):
                # Not known yet: look it up and stage a new row.
                book = qdh5.get_book(book_name)
                if book is None:
                    continue
                model = generate_book_model(**book)
                log.info(f"[{_name}]提取到书籍:{book}")
                ebook_sesson.add(model)
            task.Process = True
            # Commit per task so finished tasks stay saved if a later
            # lookup fails.
            ebook_sesson.commit()
    finally:
        # Always release the session, even when a lookup or commit raises.
        ebook_sesson.close()
    log.info(f"[{_name}]模块处理完成...")
def fether_chapter(self, id):
    """Worker loop that fetches chapter listings for queued books.

    Takes books from ``self.__book_queue`` until it is empty. For each book
    the chapter list is fetched with ``xbqg.get_chapters``; every entry is
    stored as a ``Chapter`` row (numbered from 1), committed, and then put
    on ``self.__chapter_queue``. When at least one chapter was stored the
    book's ``Process`` column is set to true. A book whose processing raises
    is put back at the end of the book queue for another attempt.

    Args:
        id ([int]): [worker thread number, used only in log messages]
    """
    from queue import Empty

    fix = f"[获取章节信息(线程{id})]"
    self.log.info(f"{fix} 开始执行")
    session = EBookSession()
    # Keep attribute values loaded after commit so the queued Chapter
    # objects stay readable once they have been handed to other workers.
    session.expire_on_commit = False
    index = 0
    try:
        while True:
            # Non-blocking get: checking empty() and then calling a blocking
            # get() can hang forever when another worker takes the last
            # book in between.
            try:
                book = self.__book_queue.get_nowait()
            except Empty:
                break

            index += 1
            msg = f"{fix}[{index}/{self.__book_queue.qsize()}]处理书籍 {book.Name}"
            try:
                chapters = xbqg.get_chapters(book.Name)
                if chapters is None:
                    continue
                # `total` doubles as the 1-based serial number and, after
                # the loop, the chapter count. It is a separate name from
                # `index` so the per-book counter above is not overwritten.
                total = 0
                for total, chapter in enumerate(chapters, start=1):
                    model = Chapter(book.Id, total, chapter['name'],
                                    chapter['url'])
                    # Store in the database; the commit happens before the
                    # chapter is queued.
                    session.add(model)
                    session.commit()
                    # Hand over to the chapter queue.
                    self.__chapter_queue.put(model)
                if total == 0:
                    continue
                # Flag the book as processed.
                session.query(Book).filter(Book.Id == book.Id).update(
                    {"Process": True})
                session.commit()
                self.log.info(f"{msg} , 提取到章节 {total} 条,已加入待下载任务")
            except Exception:
                self.log.error(f"{msg} 异常", exc_info=True)
                self.__book_queue.put(book)
                self.log.warning(f"{msg} 异常,已经放到任务末尾等待重新执行")
                # NOTE(review): chapters committed before the failure are
                # inserted again on retry. No rollback is issued here on
                # purpose: a rollback expires every object still attached
                # to this session, including chapters already queued.
    finally:
        session.close()
    self.log.info(f"{fix} 执行完成")
def spider_book_info_by_task(self):
    """Fetch book info for every unprocessed book task and queue new books.

    Loads all ``BookTask`` rows with ``Process`` false. When
    ``cacheContext.exists_book`` does not report the task's name, the book
    is looked up with ``qdh5.get_book``, stored via
    ``self.generate_book_model``, committed, and put on
    ``self.__book_queue``. A task whose lookup returns ``None`` is left
    unprocessed; every other task is flagged ``Process = True``.
    """
    _name = '书籍搜索'
    self.log.info(f"[{_name}]模块开始处理")
    session = EBookSession()
    # Keep attribute values loaded after commit so queued Book objects stay
    # readable after they leave this method.
    session.expire_on_commit = False
    try:
        tasks = session.query(BookTask).filter_by(Process=False).all()
        self.log.info(f"[{_name}]待处理数据 {len(tasks)} 条")
        for task in tasks:
            book_name = task.Name
            if not cacheContext.exists_book(book_name):
                # Not known yet: look it up and store it.
                book = qdh5.get_book(book_name)
                if book is None:
                    continue
                model = self.generate_book_model(**book)
                self.log.info(f"[{_name}]提取到书籍:{book}")
                session.add(model)
                # The commit happens before the book is queued.
                session.commit()
                # Hand over to the book queue.
                self.__book_queue.put(model)
            task.Process = True
            session.commit()
    finally:
        # Always release the session, even when a lookup or commit raises.
        session.close()
    self.log.info(f"[{_name}]模块处理完成...")
def spider_category():
    """Fetch the category tree and store categories that are not cached.

    Root categories come from ``qdh5.get_categories()``; each carries its
    children under the ``"subCategories"`` key. A category is inserted only
    when ``cacheContext.get_category`` returns ``None`` for its name. A new
    root is committed straight away because its ``Id`` is passed to the
    ``Category`` constructor of each new child.
    """
    _name = '抓取分类'
    log.info(f"[{_name}]模块开始处理")
    categories = qdh5.get_categories()
    log.info(f"抓取到根分类 {len(categories)} 条")
    ebook_sesson = EBookSession()
    try:
        for category in categories:
            # First-level category.
            parent = cacheContext.get_category(category['name'])
            if parent is None:
                parent = Category(category['sex'], category['name'],
                                  category['url'])
                log.info(f"添加一级分类:{parent}")
                ebook_sesson.add(parent)
                ebook_sesson.commit()

            # Second-level categories.
            for sub_category in category["subCategories"]:
                if cacheContext.get_category(sub_category['name']) is not None:
                    continue
                child = Category(sub_category['sex'], sub_category['name'],
                                 sub_category['url'], parent.Id)
                log.info(f"添加二级分类:{child}")
                ebook_sesson.add(child)
                ebook_sesson.commit()
        log.info(f"[{_name}]模块处理完成")
    finally:
        # Always release the session, even when an insert or commit raises.
        ebook_sesson.close()