Ejemplo n.º 1
0
Archivo: main.py Proyecto: iwenli/ebook
def spider_chapter(top=10):
    """Fetch the chapter list of unprocessed books and store it.

    Loads up to ``top`` ``Book`` rows whose ``Process`` flag is false, asks
    ``xbqg.get_chapters`` for each book's chapters by book name, creates one
    ``Chapter`` per entry (numbered from 1 in list order), flags the book as
    processed and commits once per book.

    A book for which ``xbqg.get_chapters`` returns ``None`` is skipped and
    keeps ``Process == False``.

    Args:
        top: Maximum number of books handled by one call.
    """
    # The tag used to be '分类书籍提取', copy-pasted from the category spider,
    # which made chapter logs indistinguishable from category logs.
    _name = '抓取章节'
    log.info(f"[{_name}]模块开始处理")

    ebook_sesson = EBookSession()
    try:
        books = ebook_sesson.query(Book).filter_by(
            Process=False).limit(top).all()
        log.info(f"[{_name}]本次提取到书籍:{len(books)}")
        for book in books:
            log.info(f"[{_name}]处理书籍 {book.Name}")
            chapters = xbqg.get_chapters(book.Name)
            if chapters is None:
                continue

            # Serial numbers are 1-based and follow the order of the list.
            models = [
                Chapter(book.Id, serial, chapter['name'], chapter['url'])
                for serial, chapter in enumerate(chapters, start=1)
            ]
            if models:
                ebook_sesson.add_all(models)

            log.info(f"[{_name}]书籍提取到章节 {len(models)} 条")

            book.Process = True
            ebook_sesson.commit()
    finally:
        # Release the session even when a query, fetch or commit raises.
        ebook_sesson.close()
    log.info(f"[{_name}]模块处理完成...")
Ejemplo n.º 2
0
Archivo: main.py Proyecto: iwenli/ebook
def spider_book_info_by_category():
    """Collect book records by walking every cached category.

    For each category returned by ``cacheContext.get_all_category`` the books
    listed at ``category.Url`` are fetched through ``qdh5.get_books``. Every
    book whose name is not yet known gets a model built by
    ``generate_book_model`` and is added to the session. One commit is issued
    per category.

    Category mapping: when ``ParentId`` is 0 the category itself becomes
    ``CategoryId`` and ``SubCategoryId`` is 0; otherwise ``ParentId`` becomes
    ``CategoryId`` and the category's own id becomes ``SubCategoryId``.
    """
    _name = '分类书籍提取'
    log.info(f"[{_name}]模块开始处理")

    ebook_sesson = EBookSession()
    try:
        categories = cacheContext.get_all_category()
        log.info(f"[{_name}]分类数量:{len(categories)}")

        # Names added during this run. NOTE(review): nothing visible here
        # updates the name cache after an insert, so without this set a book
        # listed under two categories would be inserted twice - confirm
        # against cacheContext.
        added_names = set()

        for category in categories:
            log.info(f"[{_name}]处理分类:{category.Name}")
            if category.ParentId == 0:
                category_id, sub_category_id = category.Id, 0
            else:
                category_id, sub_category_id = category.ParentId, category.Id

            books = qdh5.get_books(category.Url)
            for book in books:
                book_name = book['name']
                if (cacheContext.exists_book(book_name) is True
                        or book_name in added_names):
                    continue

                # Not stored yet: build the model and queue it for insert.
                model = generate_book_model(**book)
                model.CategoryId = category_id
                model.SubCategoryId = sub_category_id

                log.info(f"[{_name}]提取到书籍:{model}")
                ebook_sesson.add(model)
                added_names.add(book_name)
            # One commit per category.
            ebook_sesson.commit()
    finally:
        # Release the session even when a fetch or commit raises.
        ebook_sesson.close()
    log.info(f"[{_name}]模块处理完成...")
Ejemplo n.º 3
0
    def _init_data(self):
        """Load the pending books and put them on the book queue.

        Selects ``Book`` rows with ``Process == False`` and
        ``Id >= self.__book_id``, limited to ``self.__limit`` rows, closes the
        session and then enqueues every row on ``self.__book_queue``.

        NOTE: loading undownloaded chapters (``Chapter.Status == 0``) into the
        chapter queue was already disabled in the original code and is not
        done here.
        """
        _fix = "全局"
        self.log.info(f"[{_fix}] 开始提取数据...")
        session = EBookSession()
        try:
            pending = (
                session.query(Book)
                # SQL expression, not a Python comparison.
                .filter(Book.Process == False)  # noqa: E712
                .filter(Book.Id >= self.__book_id)
                .limit(self.__limit)
                .all()
            )
        finally:
            # Release the session even when the query raises.
            session.close()

        for book in pending:
            self.__book_queue.put(book)
        self.log.info(f"[{_fix}] 提取数据完成...")
Ejemplo n.º 4
0
Archivo: main.py Proyecto: iwenli/ebook
def download_chapter(top=10):
    """Download the content of chapters that have not been fetched yet.

    Loads up to ``top`` ``Chapter`` rows with ``Status == 0``. For each one
    the content is fetched with ``xbqg.get_chapter_content``, written to disk
    with ``file.write_book``, and the row is then updated (``WordNums`` set to
    the content length, ``Status`` set to 1) and committed.

    Chapters whose content is ``None`` are skipped. An error on one chapter is
    logged and rolled back; the remaining chapters are still processed.

    Args:
        top: Maximum number of chapters handled by one call.
    """
    _name = '下载章节'
    log.info(f"[{_name}]模块开始处理")

    ebook_sesson = EBookSession()
    try:
        chapters = ebook_sesson.query(Chapter).filter_by(
            Status=0).limit(top).all()

        log.info(f"[{_name}]提取到待下载章节 {len(chapters)}")
        for chapter in chapters:
            try:
                content = xbqg.get_chapter_content(chapter)
                if content is None:
                    continue

                # Write the file first: the row must only be flagged as
                # downloaded once the content is actually on disk.
                file.write_book(chapter.BookId, chapter.SerialNums, content)

                chapter.WordNums = len(content)
                chapter.Status = 1
                ebook_sesson.commit()
                log.info(f"[{_name}]章节下载完成 {chapter}")
            except Exception:
                # exc_info attaches the traceback. Passing the exception as a
                # positional argument made it a %-format argument for a
                # message without placeholders.
                log.error(f"[{_name}]{chapter}异常 ", exc_info=True)
                # Drop pending changes so a later commit cannot persist the
                # state of the failed chapter.
                ebook_sesson.rollback()
    finally:
        # Release the session even when the initial query raises.
        ebook_sesson.close()
    log.info(f"[{_name}]模块处理完成...")
Ejemplo n.º 5
0
    def download_chapter(self, id):
        """Worker loop that downloads queued chapters.

        Takes chapters from ``self.__chapter_queue``, fetches their content
        with ``xbqg.get_chapter_content``, writes it with ``file.write_book``
        and updates ``WordNums``/``Status`` of the matching ``Chapter`` row.

        While the chapter queue is empty but the book queue is not, the
        worker sleeps 10 seconds and checks again. It exits once both queues
        are empty. A chapter whose content is ``None`` or whose processing
        raises is put back at the end of the chapter queue.

        NOTE(review): a book that has been taken from the book queue but
        whose chapters are not queued yet is invisible to the exit check -
        confirm that callers tolerate workers finishing early.

        Args:
            id ([int]): [worker thread number, used only in log messages]
        """
        # Only needed for the queue.Empty exception type.
        import queue

        fix = f"[下载章节(线程{id})]"
        self.log.info(f"{fix} 开始执行")

        session = EBookSession()
        index = 0
        try:
            while True:
                if self.__chapter_queue.empty():
                    if self.__book_queue.empty():
                        break
                    self.log.warning(f"{fix} 队列为空,等待10秒后继续检测...")
                    time.sleep(10)  # nothing to do yet, wait 10 seconds
                    continue

                # Non-blocking get: another worker may have taken the last
                # item after the empty() check, and a blocking get() would
                # then hang this thread forever.
                try:
                    chapter = self.__chapter_queue.get_nowait()
                except queue.Empty:
                    continue

                index += 1
                msg = f"{fix}[{index}/{self.__chapter_queue.qsize()}]下载章节 {chapter.Name}"
                try:
                    content = xbqg.get_chapter_content(chapter)
                    if content is None:
                        self.__chapter_queue.put(chapter)
                        self.log.warning(f"{msg} 下载内容为空,已经放到任务末尾等待重新执行")
                        continue

                    chapter.WordNums = len(content)
                    chapter.Status = 1

                    # Write the content to disk.
                    file.write_book(chapter.BookId, chapter.SerialNums,
                                    content)

                    # Update the Chapter row.
                    session.query(Chapter).filter(
                        Chapter.Id == chapter.Id).update({
                            "WordNums": chapter.WordNums,
                            "Status": chapter.Status
                        })
                    session.commit()

                    self.log.info(f"{msg} 完成")
                except Exception:
                    self.log.error(f"{msg} 异常", exc_info=True)
                    # A session whose flush/commit failed must be rolled back
                    # before it can be used again.
                    session.rollback()
                    self.__chapter_queue.put(chapter)
                    self.log.warning(f"{msg} 异常,已经放到任务末尾等待重新执行")
                finally:
                    # Exactly one task_done() per successful get(), including
                    # the requeue paths, keeps the unfinished-task count
                    # accurate.
                    self.__chapter_queue.task_done()
        finally:
            # Release the session even when the loop is left by an error.
            session.close()
        self.log.info(f"{fix} 执行完成")
Ejemplo n.º 6
0
 def refresh(self):
     """Reload the class-level caches from the database.

     Rebuilds ``CacheContext.book_names_cache`` (set of all book names),
     ``CacheContext.categories_cache`` (all ``Category`` rows) and
     ``CacheContext.category_names_cache`` (set of their names).

     All data is read before any cache is replaced, so a failing query
     leaves the previous caches untouched.
     """
     ebook_sesson = EBookSession()
     try:
         book_names = {
             row.Name for row in ebook_sesson.query(Book.Name).all()
         }
         categories = ebook_sesson.query(Category).all()
         category_names = {category.Name for category in categories}
     finally:
         # Release the session even when a query raises.
         ebook_sesson.close()

     CacheContext.book_names_cache = book_names
     CacheContext.categories_cache = categories
     CacheContext.category_names_cache = category_names
Ejemplo n.º 7
0
Archivo: main.py Proyecto: iwenli/ebook
def spider_book_info_by_task():
    """Collect book records for the pending search tasks.

    Loads every ``BookTask`` with ``Process == False``. When the task's book
    name is not yet known to ``cacheContext.exists_book``, the book is looked
    up with ``qdh5.get_book`` and a model built by ``generate_book_model`` is
    added to the session. The task is then flagged as processed.

    A task whose lookup returns ``None`` is left unprocessed. All changes are
    committed once, after the loop.
    """
    _name = '书籍搜索'
    log.info(f"[{_name}]模块开始处理")
    ebook_sesson = EBookSession()
    try:
        tasks = ebook_sesson.query(BookTask).filter_by(Process=False).all()
        log.info(f"[{_name}]待处理数据 {len(tasks)} 条")
        for task in tasks:
            book_name = task.Name

            if cacheContext.exists_book(book_name) is False:
                # Not stored yet: look the book up and queue it for insert.
                book = qdh5.get_book(book_name)
                if book is None:
                    continue
                model = generate_book_model(**book)

                log.info(f"[{_name}]提取到书籍:{book}")
                ebook_sesson.add(model)
            task.Process = True
        ebook_sesson.commit()
    finally:
        # Release the session even when a lookup or the commit raises.
        ebook_sesson.close()
    log.info(f"[{_name}]模块处理完成...")
Ejemplo n.º 8
0
    def fether_chapter(self, id):
        """Worker loop that fetches the chapter list of queued books.

        Takes books from ``self.__book_queue`` until it is empty. For each
        book the chapter list is fetched with ``xbqg.get_chapters``, one
        ``Chapter`` per entry (numbered from 1) is stored, the book is flagged
        as processed, and the stored chapters are put on
        ``self.__chapter_queue``.

        Chapters and the book flag are committed together, and chapters are
        queued only after that commit succeeds. A book that is put back after
        an error therefore cannot leave duplicate rows or duplicate download
        tasks behind.

        Books with no chapter list (``None`` or empty) are dropped from the
        queue without being flagged as processed.

        Args:
            id ([int]): [worker thread number, used only in log messages]
        """
        # Only needed for the queue.Empty exception type.
        import queue

        fix = f"[获取章节信息(线程{id})]"
        self.log.info(f"{fix} 开始执行")

        session = EBookSession()
        # Keep attribute values loaded after commit; the Chapter objects are
        # handed to other threads through the queue.
        session.expire_on_commit = False
        index = 0

        try:
            while True:
                # Non-blocking get: another worker may empty the queue between
                # an empty() check and a blocking get().
                try:
                    book = self.__book_queue.get_nowait()
                except queue.Empty:
                    break

                index += 1
                msg = f"{fix}[{index}/{self.__book_queue.qsize()}]处理书籍 {book.Name}"
                try:
                    chapters = xbqg.get_chapters(book.Name)
                    if chapters is None:
                        continue

                    # A separate loop variable keeps the book counter `index`
                    # intact.
                    models = [
                        Chapter(book.Id, serial, chapter['name'],
                                chapter['url'])
                        for serial, chapter in enumerate(chapters, start=1)
                    ]
                    total = len(models)
                    if total == 0:
                        continue

                    # Store the chapters and flag the book in one transaction.
                    session.add_all(models)
                    session.query(Book).filter(Book.Id == book.Id).update(
                        {"Process": True})
                    session.commit()

                    # Hand the stored chapters to the download workers.
                    for model in models:
                        self.__chapter_queue.put(model)

                    self.log.info(f"{msg} ,  提取到章节 {total} 条,已加入待下载任务")

                except Exception:
                    self.log.error(f"{msg} 异常", exc_info=True)
                    # A session whose flush/commit failed must be rolled back
                    # before it can be used again.
                    session.rollback()
                    self.__book_queue.put(book)
                    self.log.warning(f"{msg} 异常,已经放到任务末尾等待重新执行")
        finally:
            # Release the session even when the loop is left by an error.
            session.close()
        self.log.info(f"{fix} 执行完成")
Ejemplo n.º 9
0
    def spider_book_info_by_task(self):
        """Collect book records for the pending search tasks and queue them.

        Loads every ``BookTask`` with ``Process == False``. When the task's
        book name is not yet known to ``cacheContext.exists_book``, the book
        is looked up with ``qdh5.get_book``, stored (committed immediately)
        and put on ``self.__book_queue``. The task is then flagged as
        processed; the flags are committed once after the loop.

        A task whose lookup returns ``None`` is left unprocessed.
        """
        _name = '书籍搜索'
        self.log.info(f"[{_name}]模块开始处理")
        session = EBookSession()
        # Keep attribute values loaded after commit; the models are handed to
        # other consumers through the queue.
        session.expire_on_commit = False
        try:
            tasks = session.query(BookTask).filter_by(Process=False).all()
            self.log.info(f"[{_name}]待处理数据 {len(tasks)} 条")
            for task in tasks:
                book_name = task.Name

                if cacheContext.exists_book(book_name) is False:
                    # Not stored yet: look the book up and insert it.
                    book = qdh5.get_book(book_name)
                    if book is None:
                        continue
                    model = self.generate_book_model(**book)

                    self.log.info(f"[{_name}]提取到书籍:{book}")
                    session.add(model)
                    session.commit()

                    # Hand the stored book to the chapter workers.
                    self.__book_queue.put(model)

                task.Process = True
            session.commit()
        finally:
            # Release the session even when a lookup or commit raises.
            session.close()
        self.log.info(f"[{_name}]模块处理完成...")
Ejemplo n.º 10
0
Archivo: main.py Proyecto: iwenli/ebook
def spider_category():
    """Fetch the category tree and store categories that are not known yet.

    Root categories come from ``qdh5.get_categories``; each carries its
    children under the ``"subCategories"`` key. A category is looked up by
    name through ``cacheContext.get_category`` and created only when that
    returns ``None``. Sub-categories are created with the root category's
    ``Id`` as fourth constructor argument.
    """
    _name = '抓取分类'
    log.info(f"[{_name}]模块开始处理")
    categories = qdh5.get_categories()
    log.info(f"抓取到根分类 {len(categories)} 条")
    ebook_sesson = EBookSession()
    try:
        for category in categories:
            # First-level category.
            categoryDb = cacheContext.get_category(category['name'])
            if categoryDb is None:
                categoryDb = Category(category['sex'], category['name'],
                                      category['url'])
                log.info(f"添加一级分类:{categoryDb}")
                ebook_sesson.add(categoryDb)
                # Committed right away; presumably so that categoryDb.Id is
                # assigned before the sub-categories reference it.
                ebook_sesson.commit()

            # Second-level categories.
            for subCategory in category["subCategories"]:
                subCategoryDb = cacheContext.get_category(subCategory['name'])
                if subCategoryDb is None:
                    subCategoryDb = Category(subCategory['sex'],
                                             subCategory['name'],
                                             subCategory['url'],
                                             categoryDb.Id)
                    log.info(f"添加二级分类:{subCategoryDb}")
                    ebook_sesson.add(subCategoryDb)
            ebook_sesson.commit()
    finally:
        # Release the session even when a lookup or commit raises.
        ebook_sesson.close()

    log.info(f"[{_name}]模块处理完成")