class BookColl(object):
    """书籍集合对象 — thin wrapper around the MongoDB ``book`` collection."""

    def __init__(self, my_database):
        """
        Initialize the book-collection wrapper.

        :param my_database: open database handle; its ``book`` attribute is
            used as the target collection.
        """
        self._error_log = ErrorLog()  # error-log writer
        self._db = my_database        # database handle
        self._books = self._db.book   # the `book` collection

    def insert_to_db(self, data):
        """
        Insert one document into the ``book`` collection.

        :type data: dict
        :param data: document to insert
        :return: None
        """
        try:
            self._books.insert_one(data)
        except Exception as e:
            # BUG FIX: the original caught `(errors, Exception)` — `errors` is
            # a module, not an exception class, so the except clause itself
            # raised TypeError — and concatenated the exception object to a
            # str. Catch Exception and stringify before concatenating.
            self._error_log.write_error('BookColl插入错误' + str(e))

    def get_book_name(self):
        """
        Print every stored book name and append each to ``book_name.txt``.

        :return: None
        """
        # Open the output file once instead of once per document (the
        # original reopened it inside the loop on every iteration).
        with open('book_name.txt', 'a', encoding='utf-8') as f:
            for result in self._books.find():
                print(result['book_name'])
                f.write(result['book_name'] + '\n')
class UrlColl(object):
    """URL集合对象 — thin wrapper around the MongoDB ``urls`` collection."""

    def __init__(self, my_database):
        """
        Initialize the URL-collection wrapper.

        :param my_database: open database handle; its ``urls`` attribute is
            used as the target collection.
        """
        self._error_log = ErrorLog()  # error-log writer
        self._db = my_database        # database handle
        self._urls = self._db.urls    # the `urls` collection

    def add_url(self, url):
        """
        Insert a URL document unless one with the same URL already exists.

        :param url: URL string
        :return: None
        """
        try:
            if not self.is_exist_url(url):
                # 'isExist' is stored as the *string* 'false' (not a bool) —
                # kept as-is for compatibility with existing documents.
                self._urls.insert_one({'url': url, 'isExist': 'false'})
        except Exception as e:
            # BUG FIX: original caught `(errors, Exception)` (a module in the
            # tuple raises TypeError) and concatenated str + exception.
            self._error_log.write_error('UrlColl添加错误' + str(e))

    def is_exist_url(self, url):
        """
        Check whether a document with this URL already exists.

        :param url: URL string
        :return: True if present, False otherwise (also False on query error)
        """
        try:
            return self._urls.find_one({"url": url}) is not None
        except Exception as e:
            self._error_log.write_error('UrlColl查找错误' + str(e))
            # Explicit False on error (original fell through returning None,
            # which callers treat the same way in boolean context).
            return False

    def get_url(self):
        """
        Fetch one not-yet-crawled URL at a random offset.

        :return: URL string, or None if the query fails or yields nothing
        """
        offset = randint(1, 100)  # random skip to spread the crawl order
        try:
            cursor = self._urls.find({'isExist': 'false'}) \
                .skip(offset).limit(1)
            return cursor[0]['url']
        except Exception as e:
            # NOTE(review): if fewer than `offset` pending URLs remain this
            # hits IndexError and returns None — confirm that is intended.
            self._error_log.write_error('UrlColl获取url错误' + str(e))

    def update_url(self, url):
        """
        Mark a URL as crawled by setting its isExist flag to 'true'.

        :param url: URL string to update
        :return: None
        """
        try:
            # NOTE(review): Collection.update is deprecated/removed in newer
            # pymongo; consider update_one — kept pending a version check.
            self._urls.update({'url': url}, {'$set': {'isExist': 'true'}})
        except Exception as e:
            self._error_log.write_error('UrlColl更新URl数据错误' + str(e))
class Crawler(object):
    """Crawler that renders a book page with Selenium, parses it with
    BeautifulSoup and stores the extracted fields in the ``book`` collection."""

    def __init__(self, thread_count):
        """
        Initialize the crawler and its database wrappers.

        :param thread_count: thread-count tracking object (has ``add_one``)
        """
        self._conn = MyDatabase()
        self._db = self._conn.database
        self._book_coll = BookColl(self._db)  # book collection wrapper
        self._url_coll = UrlColl(self._db)    # URL collection wrapper
        self._thread_count = thread_count
        self._error_log = ErrorLog()          # error-log writer

    def get_book(self, url):
        """
        Render the page at ``url``, scrape the book fields, insert them into
        the ``book`` collection and spawn a follow-up thread over the page.

        :param url: book-page URL to scrape
        :return: None
        """
        book = {}  # scraped fields accumulate here
        # Raw string: the original relied on \D, \P, \g not being escape
        # sequences — r'' makes the Windows path unambiguous.
        driver = webdriver.Firefox(
            executable_path=r'E:\DevelopTools\Python\geckodriver')
        try:
            driver.set_page_load_timeout(12)  # page-load timeout (seconds)
            driver.set_script_timeout(30)     # script timeout (seconds)
            driver.get(url)
            # Scroll bottom -> top -> bottom so lazy-loaded content renders.
            driver.execute_script(
                "var q=document.documentElement.scrollTop=100000")
            time.sleep(1)  # wait for the browser to render
            driver.execute_script(
                "var q=document.documentElement.scrollTop=0")
            time.sleep(2)
            driver.execute_script(
                "var q=document.documentElement.scrollTop=100000")
            time.sleep(1)
            soup = BeautifulSoup(driver.page_source, "lxml")
        except Exception as e:
            print(e)
            self._error_log.write_error(str(e))
            return
        finally:
            # NOTE(review): close() only closes the window; quit() would end
            # the whole driver session — confirm the intended lifecycle.
            driver.close()

        # Page is a "removed/empty" placeholder: mark the URL done and bail.
        null_wrap = soup.find("div", {"class": "null_wrap"})
        if null_wrap is not None:
            self._url_coll.update_url(url)
            return
        book['url'] = url
        book_name = soup.find("div", {"class": "name_info"})
        if book_name is None:
            self._url_coll.update_url(url)
            return
        book['book_name'] = book_name.h1.get_text(strip=True)
        # NOTE(review): the finds below assume the tags exist; a missing tag
        # raises AttributeError and aborts this book — confirm acceptable.
        book['image_url'] = soup.find("div", {"class": "big_pic"}).img['src']
        book['book_type'] = soup.find(
            "div", {"class": "breadcrumb"}).get_text(strip=True)
        book['introduction'] = soup.find(
            "span", {"class": "head_title_name"}).get_text(strip=True)
        author = soup.find("span", {"id": "author"})
        book['author'] = "" if author is None else author.text
        messbox = soup.find("div", {"class": "messbox_info"})
        for item in messbox:
            if "出版社" in str(item):
                book['publishing'] = item.get_text(strip=True)
            elif "出版时间" in str(item):
                book['publishing_time'] = item.get_text(strip=True)
        book['price'] = soup.find(
            "p", {"id": "dd-price"}).get_text(strip=True).split("¥")[1]
        editors_choice = soup.find("div", {"id": "abstract"})
        if editors_choice is None:
            book['editors_choice'] = ""
        else:
            book['editors_choice'] = editors_choice.contents[1].get_text()
        content_validity = soup.find("div", {"id": "content"})
        if content_validity is None:
            book['content_validity'] = ""
        else:
            book['content_validity'] = content_validity.contents[1].get_text()
        about_author = soup.find("div", {"id": "authorIntroduction"})
        if about_author is None:
            book['about_author'] = ""
        else:
            book['about_author'] = about_author.contents[1].get_text()
        catalog = soup.find("textarea", {"id": "catalog-textarea"})
        if catalog is None:
            catalog_div = soup.find("div", {"id": "catalog"})
            if catalog_div is None:
                book['catalog'] = ""
            else:
                book['catalog'] = catalog_div.contents[1].get_text()
        else:
            book['catalog'] = catalog.get_text(strip=True)
        media_reviews = soup.find("div", {"id": "mediaFeedback"})
        if media_reviews is None:
            book['media_reviews'] = ""
        else:
            book['media_reviews'] = media_reviews.get_text()
        # All fields gathered — persist the document.
        self._book_coll.insert_to_db(book)
        # NOTE(review): closing the shared connection after a single book
        # looks suspicious if this Crawler instance is reused — confirm.
        self._conn.close_conn()
        print(url + "完成")
        try:
            self._thread_count.add_one()  # thread counter +1
            thread = MyThread(soup, self._thread_count)
            thread.start()
        except Exception as e:
            self._error_log.write_error(str(e))
            # BUG FIX: original concatenated the exception object to a str,
            # which raises TypeError inside the handler.
            print("Error: 无法启动线程" + str(e))