def book(shopId, pageNo): html_soup = soup('http://shop.kongfz.com/' + str(shopId) + '/all/0_50_0_0_' + str(pageNo) + '_sort_desc_0_0/') items = html_soup.select('.list-content > div') books = [] for item in items[0:50]: books.append(str(item['itemid']) + ':' + str(item['isbn'])) return books
def contents(web_url): # 爬虫获取网页没啥好说的 html_soup = soup(web_url) tags = html_soup.select('.tagCol > tbody > tr > td > a') temps = [] for tag in tags: temp = tag.get_text() if len(temp) > 0: temps.append(temp) return temps
def get_book(bookUrl): # 爬虫获取网页没啥好说的 book_soup = soup(bookUrl) # 内容 content = book_soup.find_all(id='info')[0] # 标题 title = book_soup.find_all(property='v:itemreviewed')[0].get_text() # 评分 score = "0" scores = book_soup.find_all(property='v:average') if len(scores) > 0: score = replaces(str(scores[0].get_text())) if len(score) == 0: score = "0" # 评价人数 comments = 0 comment = book_soup.find_all(property='v:votes') if len(comment) > 0: comments = comment[0].get_text() if len(comments) == 0: comments = 0 # 封面 cover_image = book_soup.find_all(rel='v:photo')[0]['src'] # 去除空格,在用换行符分割成list temps = replaces(content.get_text()).split('\n') info = [] for temps_index in range(len(temps)): if temps_index > 0: if temps[temps_index].find(':') == -1: info[-1] = info[-1] + temps[temps_index] else: info.append(temps[temps_index]) else: info.append(temps[temps_index]) params = [ Id(bookUrl), title, cover_image, score, comments, '', '', '', '', '', 0, 0, '', local(), local() ] for i in info: if i.find('作者') > -1: author = replaces(i.split(':')[1]) params[5] = country(author)[0] params[6] = country(author)[1].replace("'", "`") if i.find('出版社') > -1: params[7] = replaces(i.split(':')[1]).replace("'", "`") if i.find('译者') > -1: params[8] = replaces(i.split(':')[1]) if i.find('出版年') > -1: params[9] = replaces(i.split(':')[1]) if i.find('定价') > -1: params[10] = re.sub('[^0-9.]', '', i.split(':')[1]) if i.find('页数') > -1: pageSize = re.sub('[^0-9]', '', i.split(':')[1]) if pageSize == '': pageSize = '0' params[11] = pageSize if i.find('ISBN') > -1: params[12] = replaces(i.split(':')[1]) print(params) save(params)
def books(web_url): target_soup = soup(web_url) book_list = [] for item in target_soup.select('.nbg'): book_list.append(item['href']) return book_list