def spider_book_info_by_task(self):
    """Fetch book info for every pending ``BookTask`` and queue new books.

    Loads all ``BookTask`` rows whose ``Process`` flag is ``False``. For each
    task whose name is not yet known to ``cacheContext``, the book is looked
    up through ``qdh5.get_book``, converted with ``self.generate_book_model``,
    committed, and pushed onto the book queue. A task whose lookup returns
    ``None`` is skipped and keeps ``Process=False``; every other task is
    marked as processed.

    The database session is always closed, even if a lookup or commit raises.
    """
    _name = '书籍搜索'
    self.log.info(f"[{_name}]模块开始处理")
    session = EBookSession()
    # Keep attributes loaded after commit; presumably so the models handed
    # to the queue stay readable once the session is closed -- TODO confirm.
    session.expire_on_commit = False
    try:
        tasks = session.query(BookTask).filter_by(Process=False).all()
        self.log.info(f"[{_name}]待处理数据 {len(tasks)} 条")
        for task in tasks:
            book_name = task.Name
            if cacheContext.exists_book(book_name) is False:
                # Book is not known yet: look it up remotely.
                book = qdh5.get_book(book_name)
                if book is None:
                    # Nothing found; leave the task unprocessed.
                    continue
                model = self.generate_book_model(**book)
                self.log.info(f"[{_name}]提取到书籍:{book}")
                session.add(model)
                # Commit before queueing so the model is persisted first.
                session.commit()
                # Hand the stored book over to the book queue.
                self.__book_queue.put(model)
            task.Process = True
            session.commit()
    finally:
        session.close()
    self.log.info(f"[{_name}]模块处理完成...")
def spider_category():
    """Fetch the category tree and store categories that are not cached yet.

    Root categories come from ``qdh5.get_categories()``. Each root category
    and each entry of its ``"subCategories"`` list is looked up by name in
    ``cacheContext``; missing ones are created as ``Category`` rows. A
    sub-category is created with the root category's ``Id`` as its fourth
    constructor argument (presumably the parent id -- confirm against
    ``Category``).

    The database session is always closed, even if fetching or committing
    raises.
    """
    _name = '抓取分类'
    log.info(f"[{_name}]模块开始处理")
    categories = qdh5.get_categories()
    log.info(f"抓取到根分类 {len(categories)} 条")
    session = EBookSession()
    try:
        for category in categories:
            # Root (first-level) category.
            category_db = cacheContext.get_category(category['name'])
            if category_db is None:
                category_db = Category(
                    category['sex'], category['name'], category['url'])
                log.info(f"添加一级分类:{category_db}")
                session.add(category_db)
                # Commit now so category_db.Id is available for the children.
                session.commit()

            # Second-level categories.
            for sub_category in category["subCategories"]:
                sub_category_db = cacheContext.get_category(
                    sub_category['name'])
                if sub_category_db is None:
                    sub_category_db = Category(
                        sub_category['sex'], sub_category['name'],
                        sub_category['url'], category_db.Id)
                    log.info(f"添加二级分类:{sub_category_db}")
                    session.add(sub_category_db)
            session.commit()
        log.info(f"[{_name}]模块处理完成")
    finally:
        session.close()
def spider_book_info_by_category():
    """Fetch the books listed under every cached category and store new ones.

    For each category from ``cacheContext.get_all_category()`` the listing at
    ``category.Url`` is fetched through ``qdh5.get_books``. Books already
    known to ``cacheContext`` are skipped; the rest are converted with
    ``generate_book_model`` and tagged with category ids:

    * ``ParentId == 0``: ``CategoryId`` is the category's own ``Id`` and
      ``SubCategoryId`` is ``0``.
    * otherwise: ``CategoryId`` is ``ParentId`` and ``SubCategoryId`` is the
      category's own ``Id``.

    New books are committed once per category. The database session is
    always closed, even if fetching or committing raises.
    """
    _name = '分类书籍提取'
    log.info(f"[{_name}]模块开始处理")
    session = EBookSession()
    try:
        categories = cacheContext.get_all_category()
        log.info(f"[{_name}]分类数量:{len(categories)}")
        for category in categories:
            log.info(f"[{_name}]处理分类:{category.Name}")
            # Locals renamed so the builtin ``id`` is no longer shadowed.
            category_id = category.ParentId
            sub_category_id = category.Id
            if category_id == 0:
                # The category has no parent: it is itself the root.
                category_id = sub_category_id
                sub_category_id = 0

            for book in qdh5.get_books(category.Url):
                if cacheContext.exists_book(book['name']) is True:
                    continue
                # Book is not known yet: build and stage a new row.
                model = generate_book_model(**book)
                model.CategoryId = category_id
                model.SubCategoryId = sub_category_id
                log.info(f"[{_name}]提取到书籍:{model}")
                session.add(model)
            # One commit per category.
            session.commit()
    finally:
        session.close()
    log.info(f"[{_name}]模块处理完成...")
def fether_chapter(self, id):
    """Worker that fetches chapter lists for the books in the book queue.

    Takes books from the book queue until it is empty. For each book the
    chapter list is fetched with ``xbqg.get_chapters``; every chapter is
    stored as a ``Chapter`` row (numbered from 1), committed, and pushed onto
    the chapter queue. When at least one chapter was stored the book's
    ``Process`` flag is set to ``True``. If anything raises while handling a
    book, the error is logged and the book is put back on the queue for
    another attempt.

    Args:
        id (int): Index of the worker thread, used only in log messages.
    """
    fix = f"[获取章节信息(线程{id})]"
    self.log.info(f"{fix} 开始执行")
    session = EBookSession()
    # Keep attributes loaded after commit so the models handed to the
    # chapter queue remain usable once they leave this session.
    session.expire_on_commit = False
    # Per-worker count of books taken from the queue. Kept separate from the
    # chapter loop variable, which previously overwrote it.
    book_no = 0
    try:
        # NOTE(review): empty() followed by a blocking get() can stall if
        # another worker takes the last item in between -- confirm whether
        # several workers share this queue.
        while not self.__book_queue.empty():
            book = self.__book_queue.get()
            book_no += 1
            msg = f"{fix}[{book_no}/{self.__book_queue.qsize()}]处理书籍 {book.Name}"
            try:
                chapters = xbqg.get_chapters(book.Name)
                if chapters is None:
                    continue
                total = 0
                for total, chapter in enumerate(chapters, start=1):
                    model = Chapter(
                        book.Id, total, chapter['name'], chapter['url'])
                    # Persist the chapter before queueing it.
                    session.add(model)
                    session.commit()
                    # Hand the stored chapter over to the chapter queue.
                    self.__chapter_queue.put(model)
                if total == 0:
                    # No chapters found; leave the book unprocessed.
                    continue
                # Mark the book as processed.
                session.query(Book).filter(Book.Id == book.Id).update(
                    {"Process": True})
                session.commit()
                self.log.info(f"{msg} , 提取到章节 {total} 条,已加入待下载任务")
            except Exception:
                self.log.error(f"{msg} 异常", exc_info=True)
                self.__book_queue.put(book)
                self.log.warning(f"{msg} 异常,已经放到任务末尾等待重新执行")
                # Reset the session so a failed flush/commit does not make
                # every later use of it fail as well.
                session.rollback()
    finally:
        session.close()
    self.log.info(f"{fix} 执行完成")
def spider_book_info_by_task():
    """Fetch book info for every pending ``BookTask`` and store new books.

    Loads all ``BookTask`` rows whose ``Process`` flag is ``False``. For each
    task whose name is not yet known to ``cacheContext``, the book is looked
    up through ``qdh5.get_book`` and converted with ``generate_book_model``
    before being added to the session. A task whose lookup returns ``None``
    is skipped and keeps ``Process=False``; every other task is marked as
    processed.

    The database session is always closed, even if a lookup or commit raises.
    """
    _name = '书籍搜索'
    log.info(f"[{_name}]模块开始处理")
    session = EBookSession()
    try:
        tasks = session.query(BookTask).filter_by(Process=False).all()
        log.info(f"[{_name}]待处理数据 {len(tasks)} 条")
        for task in tasks:
            book_name = task.Name
            if cacheContext.exists_book(book_name) is False:
                # Book is not known yet: look it up remotely.
                book = qdh5.get_book(book_name)
                if book is None:
                    # Nothing found; leave the task unprocessed.
                    continue
                model = generate_book_model(**book)
                log.info(f"[{_name}]提取到书籍:{book}")
                session.add(model)
            task.Process = True
            session.commit()
    finally:
        session.close()
    log.info(f"[{_name}]模块处理完成...")