Esempio n. 1
0
    def spider_book_info_by_task(self):
        """Fetch book info for every unprocessed ``BookTask``.

        Each task whose ``Process`` flag is False is looked up by name. When
        ``cacheContext.exists_book`` reports the book as missing, it is fetched
        with ``qdh5.get_book``, converted with ``self.generate_book_model``,
        committed, and put on the book queue. A task is flagged as processed
        unless the lookup returned ``None``, in which case it is left
        untouched.

        The session is always closed, including when scraping or a commit
        raises; the exception itself still propagates to the caller.
        """
        _name = '书籍搜索'
        self.log.info(f"[{_name}]模块开始处理")
        session = EBookSession()
        # Keep loaded attribute values on instances after commit, so the models
        # handed to the queue stay readable once this session is closed.
        session.expire_on_commit = False
        try:
            tasks = session.query(BookTask).filter_by(Process=False).all()
            self.log.info(f"[{_name}]待处理数据 {len(tasks)} 条")
            for task in tasks:
                if cacheContext.exists_book(task.Name) is False:
                    # Book is not known yet: fetch it from the remote source.
                    book = qdh5.get_book(task.Name)
                    if book is None:
                        # Nothing found; leave the task unprocessed.
                        continue
                    model = self.generate_book_model(**book)

                    self.log.info(f"[{_name}]提取到书籍:{book}")
                    session.add(model)
                    # Commit before queueing so consumers receive a persisted row.
                    session.commit()

                    self.__book_queue.put(model)

                task.Process = True
            session.commit()
        finally:
            # Release the connection even if the loop above raised.
            session.close()
        self.log.info(f"[{_name}]模块处理完成...")
Esempio n. 2
0
File: main.py Progetto: iwenli/ebook
def spider_category():
    """Fetch the category tree and store every category not already known.

    Root categories come from ``qdh5.get_categories()``; each entry carries
    its children under the ``"subCategories"`` key. A category is looked up
    by name through ``cacheContext.get_category`` and only inserted when
    that lookup returns ``None``.

    The session is always closed, including when scraping or a commit
    raises; the exception itself still propagates to the caller.
    """
    _name = '抓取分类'
    log.info(f"[{_name}]模块开始处理")
    categories = qdh5.get_categories()
    log.info(f"抓取到根分类 {len(categories)} 条")
    session = EBookSession()
    try:
        for category in categories:
            # First-level category.
            parent = cacheContext.get_category(category['name'])
            if parent is None:
                parent = Category(category['sex'], category['name'],
                                  category['url'])
                log.info(f"添加一级分类:{parent}")
                session.add(parent)
                # Committed immediately, presumably so that parent.Id is
                # populated before the children below reference it.
                session.commit()

            # Second-level categories, linked to the parent by its Id.
            for sub_category in category["subCategories"]:
                child = cacheContext.get_category(sub_category['name'])
                if child is None:
                    child = Category(sub_category['sex'],
                                     sub_category['name'],
                                     sub_category['url'], parent.Id)
                    log.info(f"添加二级分类:{child}")
                    session.add(child)
            # One commit per root category for all of its new children.
            session.commit()

        log.info(f"[{_name}]模块处理完成")
    finally:
        # Release the connection even if the loop above raised.
        session.close()
Esempio n. 3
0
File: main.py Progetto: iwenli/ebook
def spider_book_info_by_category():
    """Fetch the books listed under every cached category and store new ones.

    For each category returned by ``cacheContext.get_all_category()`` the
    listing at ``category.Url`` is fetched with ``qdh5.get_books``. Books
    that ``cacheContext.exists_book`` already reports as present are
    skipped; the rest are converted with ``generate_book_model``, tagged
    with their category ids and added to the session. There is one commit
    per category.

    The session is always closed, including when scraping or a commit
    raises; the exception itself still propagates to the caller.
    """

    _name = '分类书籍提取'
    log.info(f"[{_name}]模块开始处理")

    session = EBookSession()
    try:
        categories = cacheContext.get_all_category()
        log.info(f"[{_name}]分类数量:{len(categories)}")
        for category in categories:
            log.info(f"[{_name}]处理分类:{category.Name}")
            # A ParentId of 0 is treated as "this is a root category": the
            # category's own Id becomes the main id and the sub id is 0.
            sub_category_id = category.Id
            category_id = category.ParentId
            if category_id == 0:
                category_id = sub_category_id
                sub_category_id = 0

            books = qdh5.get_books(category.Url)
            for book in books:
                if cacheContext.exists_book(book['name']) is True:
                    continue
                # Book is not stored yet.
                model = generate_book_model(**book)
                model.CategoryId = category_id
                model.SubCategoryId = sub_category_id

                log.info(f"[{_name}]提取到书籍:{model}")
                session.add(model)
            # One commit per category listing.
            session.commit()
    finally:
        # Release the connection even if the loop above raised.
        session.close()
    log.info(f"[{_name}]模块处理完成...")
Esempio n. 4
0
    def fether_chapter(self, id):
        """Worker that drains the book queue and stores each book's chapters.

        For every book taken from the book queue, chapters are looked up with
        ``xbqg.get_chapters(book.Name)``. Each chapter is saved as a
        ``Chapter`` row (one commit per chapter) and then put on the chapter
        queue. When at least one chapter was stored, the book's ``Process``
        column is set to True. If anything raises while handling a book, the
        error is logged, the book is put back on the queue for another
        attempt, and the session is rolled back so it stays usable.

        Args:
            id ([int]): worker/thread number; used only in log messages.
        """
        fix = f"[获取章节信息(线程{id})]"
        self.log.info(f"{fix} 开始执行")

        session = EBookSession()
        # Keep loaded attribute values on instances after commit, so the
        # chapter models handed to the queue stay readable later on.
        session.expire_on_commit = False
        # Number of books this worker has taken from the queue. Kept separate
        # from the chapter counter below so the progress figure in the log
        # is not overwritten by the chapter loop.
        handled = 0

        try:
            # NOTE(review): empty() followed by a blocking get() is not atomic;
            # if several workers share this queue, get() could block once
            # another worker takes the last item -- confirm against callers.
            while not self.__book_queue.empty():
                book = self.__book_queue.get()
                handled += 1
                msg = f"{fix}[{handled}/{self.__book_queue.qsize()}]处理书籍 {book.Name}"
                try:
                    chapters = xbqg.get_chapters(book.Name)
                    if chapters is None:
                        continue
                    total = 0
                    # `total` is the 1-based chapter position and, after the
                    # loop, the number of chapters stored.
                    for total, chapter in enumerate(chapters, start=1):
                        model = Chapter(book.Id, total, chapter['name'],
                                        chapter['url'])
                        # Persist first, then hand over to the download queue.
                        session.add(model)
                        session.commit()

                        self.__chapter_queue.put(model)

                    if total == 0:
                        continue

                    # Mark the book as processed.
                    session.query(Book).filter(Book.Id == book.Id).update(
                        {"Process": True})
                    session.commit()

                    self.log.info(f"{msg} ,  提取到章节 {total} 条,已加入待下载任务")

                except Exception:
                    self.log.error(f"{msg} 异常", exc_info=True)
                    self.__book_queue.put(book)
                    self.log.warning(f"{msg} 异常,已经放到任务末尾等待重新执行")
                    # A failed flush/commit leaves the session unusable until
                    # it is rolled back; do that before the next book.
                    session.rollback()
        finally:
            # Release the connection even if something above raised.
            session.close()
        self.log.info(f"{fix} 执行完成")
Esempio n. 5
0
File: main.py Progetto: iwenli/ebook
def spider_book_info_by_task():
    """Fetch book info for every unprocessed ``BookTask``.

    Each task whose ``Process`` flag is False is looked up by name. When
    ``cacheContext.exists_book`` reports the book as missing, it is fetched
    with ``qdh5.get_book``, converted with ``generate_book_model`` and added
    to the session. A task is flagged as processed unless the lookup
    returned ``None``, in which case it is left untouched. Everything is
    committed once, after all tasks were visited.

    The session is always closed, including when scraping or the commit
    raises; the exception itself still propagates to the caller.
    """
    _name = '书籍搜索'
    log.info(f"[{_name}]模块开始处理")
    session = EBookSession()
    try:
        tasks = session.query(BookTask).filter_by(Process=False).all()
        log.info(f"[{_name}]待处理数据 {len(tasks)} 条")
        for task in tasks:
            if cacheContext.exists_book(task.Name) is False:
                # Book is not known yet: fetch it from the remote source.
                book = qdh5.get_book(task.Name)
                if book is None:
                    # Nothing found; leave the task unprocessed.
                    continue
                model = generate_book_model(**book)

                log.info(f"[{_name}]提取到书籍:{book}")
                session.add(model)
            task.Process = True
        session.commit()
    finally:
        # Release the connection even if the loop above raised.
        session.close()
    log.info(f"[{_name}]模块处理完成...")