Example #1
class BookColl(object):
    """
    书籍集合对象
    """
    def __init__(self, my_database):
        """
        书籍集合对象初始化

        :param my_database:
        """
        self._error_log = ErrorLog()  # 创建错误日志输出对象
        self._db = my_database  # 数据库对象
        self._books = self._db.book  # 获取book集合

    def insert_to_db(self, data):
        """
        插入数据到数据库

        :type data dict
        :param data: 需要插入的字典数据
        :return: None
        """
        try:
            self._books.insert_one(data)  # 插入数据
        except (errors, Exception) as e:
            self._error_log.write_error('BookColl插入错误' + e)  # 错误日志记录

    def get_book_name(self):
        """
        获取书籍名称
        :return:
        """
        # for result in self._books.find({'book_name': {'$regex': '\w'}}):
        for result in self._books.find():
            print(result['book_name'])
            with open('book_name.txt', 'a', encoding='utf-8') as f:
                f.write(result['book_name'] + '\n')
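
A minimal usage sketch for BookColl, assuming a local MongoDB instance and that the project's ErrorLog class is importable; the database name dangdang is only an illustration:

from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017')
books = BookColl(client.dangdang)  # any pymongo Database works as my_database
books.insert_to_db({'book_name': 'Example Title', 'price': '59.00'})
books.get_book_name()  # prints every name and appends it to book_name.txt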
Example #2
class UrlColl(object):
    """
    URL集合对象
    """
    def __init__(self, my_database):
        """
        URL集合对象

        :param my_database:
        """
        self._error_log = ErrorLog()  # 新建错误日志对象
        self._db = my_database  # 数据库对象
        self._urls = self._db.urls  # URL集合

    def add_url(self, url):
        """
        添加URL数据到集合中

        :param url:  URL数据
        :return: None
        """
        try:
            if not self.is_exist_url(url):  # 判断是否存在
                self._urls.insert_one({'url': url, 'isExist': 'false'})  # 插入
        except (errors, Exception) as e:
            self._error_log.write_error('UrlColl添加错误' + e)  # 错误日志写入

    def is_exist_url(self, url):
        """
        判读是否已经存在相对应的数据

        :param url: URL地址
        :return: boolean 存在返回True不存在返回False
        """
        try:
            result = self._urls.find_one({"url": url})  # 获取查询结果
            if result is None:
                return False  # 返回False
            else:
                return True  # 返回True
        except (errors, Exception) as e:
            self._error_log.write_error('UrlColl查找错误' + e)  # 错误日志写入

    def get_url(self):
        """
        从数据库中随机获取一条数据

        :return: URL地址字符串
        """
        num = randint(1, 100)  # 随机数
        try:
            result = self._urls.find({
                'isExist': 'false'
            }).skip(num).limit(1)  # 跳跃式获取数据
            return result[0]['url']  # 返回对应的URL地址
        except (errors, Exception) as e:
            self._error_log.write_error('UrlColl获取url错误' + e)  # 错误日志写入

    def update_url(self, url):
        """
        更新URL数据

        :param url: 需要更新的URL数据
        :return: None
        """
        try:
            self._urls.update({'url': url}, {'$set': {
                'isExist': 'true'
            }})  # 更新URL的状态为True表示已经爬取过了
        except (errors, Exception) as e:
            self._error_log.write_error('UrlColl更新URl数据错误' + e)  # 错误日志写入
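
A minimal sketch of the bookkeeping cycle UrlColl implements, under the same assumptions as above (local MongoDB, project ErrorLog, illustrative database name); note that get_url expects the collection to already hold a fair number of uncrawled URLs, since it skips a random offset of up to 100:

from pymongo import MongoClient

urls = UrlColl(MongoClient('mongodb://localhost:27017').dangdang)
urls.add_url('http://product.dangdang.com/25093212.html')  # stored with isExist='false'
next_url = urls.get_url()   # pick a pseudo-random uncrawled URL
urls.update_url(next_url)   # mark it as crawled once processed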
Example #3
class Crawler(object):
    def __init__(self, thread_count):
        """
        Initialize the crawler.

        :param thread_count: thread counter object
        """
        self._conn = MyDatabase()
        self._db = self._conn.database
        self._book_coll = BookColl(self._db)  # collection wrappers
        self._url_coll = UrlColl(self._db)
        self._thread_count = thread_count
        self._error_log = ErrorLog()  # error-log writer

    def get_book(self, url):
        """
        获取书籍数据

        :param url: 获取书籍的URL地址
        :return: None
        """
        book = {}  # 初始化字典 用于保存数据
        # 初始化浏览器驱动程序,获得浏览器驱动对象
        driver = webdriver.Firefox(
            executable_path='E:\DevelopTools\Python\geckodriver')
        # driver = webdriver.Ie(executable_path='E:\DevelopTools\Python\IEDriverServer')
        try:
            driver.set_page_load_timeout(12)  # 设置页面加载超时时间
            driver.set_script_timeout(30)  # 设置页面脚本响应超时时间
            driver.get(url)  # 设置浏览器获取页面的地址
            js = "var q=document.documentElement.scrollTop=100000"  # 浏览器执行的js代码 向下滑动100000xp
            driver.execute_script(js)  # 运行脚本
            time.sleep(1)  # 休眠等待浏览器执行
            js = "var q=document.documentElement.scrollTop=0"  # 浏览器js代码 回到顶部
            driver.execute_script(js)  # 运行脚本
            time.sleep(2)  # 休眠等待浏览器执行
            js = "var q=document.documentElement.scrollTop=100000"  # 浏览器js代码, 回到底部
            driver.execute_script(js)  # 运行脚本
            time.sleep(1)  # 休眠等待浏览器执行, 模拟浏览器滑动完成
            soup = BeautifulSoup(driver.page_source,
                                 "lxml")  # 传递页面数据, 初始化bs4对象
        except Exception as e:
            print(e)  # 输出错误信息
            self._error_log.write_error(e)  # 记录错误信息
            return  # 返回空
        finally:
            driver.close()  # 关闭浏览器
        # target = driver.find_element_by_id("footer")
        # driver.execute_script("arguments[0].scrollIntoView();", target)  # 拖动到可见的元素去

        # extract the fields from the relevant tags below
        null_wrap = soup.find("div", {"class": "null_wrap"})
        if null_wrap is not None:  # the product page no longer exists
            self._url_coll.update_url(url)
            return
        book['url'] = url
        book_name = soup.find("div", {"class": "name_info"})
        if book_name is None:
            self._url_coll.update_url(url)
            return
        book['book_name'] = book_name.h1.get_text(strip=True)
        book['image_url'] = soup.find("div", {"class": "big_pic"}).img['src']
        book['book_type'] = soup.find("div", {
            "class": "breadcrumb"
        }).get_text(strip=True)
        book['introduction'] = soup.find("span", {
            "class": "head_title_name"
        }).get_text(strip=True)
        author = soup.find("span", {"id": "author"})
        if author is None:
            book['author'] = ""
        else:
            book['author'] = author.text
        messbox = soup.find("div", {"class": "messbox_info"})
        for item in messbox:
            if "出版社" in str(item):
                book['publishing'] = item.get_text(strip=True)
            elif "出版时间" in str(item):
                book['publishing_time'] = item.get_text(strip=True)
        book['price'] = soup.find("p", {
            "id": "dd-price"
        }).get_text(strip=True).split("¥")[1]
        editors_choice = soup.find("div", {"id": "abstract"})
        if editors_choice is None:
            book['editors_choice'] = ""
        else:
            book['editors_choice'] = editors_choice.contents[1].get_text()
        content_validity = soup.find("div", {"id": "content"})
        if content_validity is None:
            book['content_validity'] = ""
        else:
            book['content_validity'] = content_validity.contents[1].get_text()
        about_author = soup.find("div", {"id": "authorIntroduction"})
        if about_author is None:
            book['about_author'] = ""
        else:
            book['about_author'] = about_author.contents[1].get_text()
        catalog = soup.find("textarea", {"id": "catalog-textarea"})
        if catalog is None:
            catalog2 = soup.find("div", {"id": "catalog"})
            if catalog2 is None:
                book['catalog'] = ""
            else:
                book['catalog'] = catalog2.contents[1].get_text()
        else:
            book['catalog'] = catalog.get_text(strip=True)
        media_reviews = soup.find("div", {"id": "mediaFeedback"})
        if media_reviews is None:
            book['media_reviews'] = ""
        else:
            book['media_reviews'] = media_reviews.get_text()
        # the data was scraped successfully; insert it into the book collection
        self._book_coll.insert_to_db(book)
        self._conn.close_conn()
        print(url + " done")
        try:
            self._thread_count.add_one()  # increment the thread counter
            thread = MyThread(soup, self._thread_count)  # create the follow-up thread
            thread.start()  # start it
        except Exception as e:
            self._error_log.write_error(e)  # log the error
            print("Error: could not start thread: " + str(e))