def get_book_details(self, url, book_id, book_capter_numb):
    html = self.get_html(url)
    soup = BeautifulSoup(html, 'lxml')
    book_capter_name = soup.select(
        '#wrapper .content_read .box_con .bookname h1')[0].text
    book_content = soup.select('#wrapper .content_read #content')[0].text
    # The third link under .bottem1 is the "next chapter" link.
    next_url = soup.select(
        '#wrapper .content_read .box_con .bookname .bottem1 a'
    )[2].attrs['href']
    data_for_book_details = {
        BOOK_ID: book_id,
        DETAILS_NEXT_URL: next_url,
        DETAILS_CAPTER_NUMB: book_capter_numb,
        DETAILS_CAPTER_NAME: book_capter_name,
        DETAILS_CONTENT: book_content
    }
    # Write the chapter record into the book_details collection.
    spidermongo = SpiderMongo()
    spidermongo.insert_data_dabases(BOOK_DETAILS_COLLECTION,
                                    data_for_book_details)
    return next_url
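# Hedged usage sketch (not part of the original code): get_book_details is
# meant to be driven in a loop, feeding each returned next_url back in until
# the "next" link stops pointing at a chapter page. The stop condition below
# is an assumption about the target site's layout (on the last chapter, the
# "next" link typically points back at the book's index page rather than a
# .html chapter page).
def _crawl_chapters_sketch(self, first_url, book_id, start_numb=1):
    url = first_url
    capter_numb = start_numb
    while url and url.endswith('.html'):
        url = self.get_book_details(url, book_id, capter_numb)
        capter_numb += 1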
def get_newest_capter_numb_from_mongodb(self, book_id):
    s = SpiderMongo()
    res = s.get_newest_capter_numb_from_mongodb(book_id)
    if not res:
        return None
    return res[0]['book_capter_numb']
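# Note (assumption about SpiderMongo): get_newest_capter_numb_from_mongodb is
# expected to return a list of chapter documents sorted newest-first, each
# containing at least a 'book_capter_numb' field; only that number is read
# from the first document here.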
def update_book_infos_by_book_id(self, url, book_class, book_id):
    '''
    Returns: the URL of the book's newest chapter.
    '''
    # book_id is re-derived from the page URL, overriding the argument.
    book_id = self.get_id_from_url(url)
    html = self.get_html(url)
    soup = BeautifulSoup(html, 'lxml')
    book_st = soup.select('#wrapper .box_con #maininfo #info p')[1].text
    book_status = book_st.split(',')[0]
    book_last_updata_time = soup.select(
        '#wrapper .box_con #maininfo #info p')[2].text
    book_last_updata_desc = soup.select(
        '#wrapper .box_con #maininfo #info p')[3].text
    book_last_updata_url = soup.select(
        '#wrapper .box_con #maininfo #info p a')[2].attrs['href']
    data_for_book_infos = {
        INFOS_STATUS: book_status,
        INFOS_LAST_UPDATE_TIME: book_last_updata_time,
        INFOS_LAST_UPDATE_DESC: book_last_updata_desc,
        INFOS_LAST_UPDATE_URL: book_last_updata_url
    }
    s = SpiderMongo()
    s.update_book_infos_by_book_id(book_id, data_for_book_infos)
    return book_last_updata_url
def get_book_infos(self, url, book_class, book_id):
    html = self.get_html(url)
    soup = BeautifulSoup(html, 'lxml')
    # The #info block holds, in order: author, status, last-update time,
    # and last-update description.
    info_ps = soup.select('#wrapper .box_con #maininfo #info p')
    book_name = soup.select('#wrapper .box_con #maininfo #info h1')[0].text
    book_auhter = info_ps[0].text
    book_st = info_ps[1].text
    book_status = book_st.split(',')[0]
    book_last_updata_time = info_ps[2].text
    book_last_updata_desc = info_ps[3].text
    book_last_updata_url = soup.select(
        '#wrapper .box_con #maininfo #info p a')[2].attrs['href']
    img_url = soup.select(
        '#wrapper .box_con #sidebar #fmimg img')[0].attrs['src']
    book_desc = soup.select('#wrapper .box_con #maininfo #intro')[0].text
    data_for_book_infos = {
        BOOK_ID: int(book_id),
        INFOS_CLASS: book_class,
        INFOS_NAME: book_name,
        INFOS_AUTHER: book_auhter,
        INFOS_IMG_URL: img_url,
        INFOS_STATUS: book_status,
        INFOS_LAST_UPDATE_TIME: book_last_updata_time,
        INFOS_LAST_UPDATE_DESC: book_last_updata_desc,
        INFOS_LAST_UPDATE_URL: book_last_updata_url,
        INFOS_DECS: book_desc
    }
    # Download the cover image and store the local path instead of the
    # remote URL; fall back to None if the download fails.
    filepath = BOOK_IMG_DIR + '/' + book_id + '.jpg'
    try:
        urllib.request.urlretrieve(data_for_book_infos[INFOS_IMG_URL],
                                   filename=filepath)
        new_img_url = filepath
    except Exception as e:
        print("Error occurred while downloading the cover image:")
        print(e)
        new_img_url = None
    data_for_book_infos[INFOS_IMG_URL] = new_img_url
    # Insert the record into the book_infos collection.
    spidermongo = SpiderMongo()
    spidermongo.insert_data_dabases(BOOK_INFOS_COLLECTION,
                                    data_for_book_infos)
    # Return the first chapter URL and the newest-chapter URL.
    first_url = soup.select('#wrapper .box_con #list dl dd a')[0].attrs['href']
    return first_url, book_last_updata_url
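# Hedged usage example (not part of the original code): a fresh book is
# crawled from the first chapter URL returned above, while the second return
# value tells the caller where the newest chapter lives.
#     first_url, last_url = spider.get_book_infos(url, book_class, book_id)
#     spider.get_book_details(first_url, book_id, 1)  # then follow next_url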
def is_book_id_in_mongodb(self, book_id):
    '''
    Args:
        book_id: the book's ID.
    Returns:
        the URL of the book's newest chapter if the ID exists,
        otherwise None.
    '''
    searchid = SpiderMongo()
    result = searchid.is_book_id_in_mongodb(book_id)
    if result is None:
        return None
    return result['book_last_updata_url']
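# Hedged flow sketch (not part of the original code): how the methods above
# fit together for a single book page. Method names are taken from the calls
# in this file; the control flow itself is an assumption.
def _process_book_sketch(self, url, book_class):
    book_id = self.get_id_from_url(url)
    newest_url = self.is_book_id_in_mongodb(book_id)
    if newest_url is None:
        # Unknown book: store its metadata, then crawl from chapter one.
        first_url, last_url = self.get_book_infos(url, book_class, book_id)
        self.get_book_details(first_url, book_id, 1)
    else:
        # Known book: refresh the mutable info fields, then resume from the
        # newest chapter already recorded in MongoDB.
        self.update_book_infos_by_book_id(url, book_class, book_id)
        numb = self.get_newest_capter_numb_from_mongodb(book_id)
        self.get_book_details(newest_url, book_id, (numb or 0) + 1)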