def parse(self, response, **kwargs):
    li_list = response.xpath('//ul[@class="gl-warp clearfix"]/li')
    for li in li_list:
        item = JdItem()
        phone_desc = li.xpath(
            './/div[@class="p-name p-name-type-3"]/a/em/text()'
        ).extract_first().strip()
        phone_price = li.xpath(
            './/div[@class="p-price"]//i/text()').extract_first()
        phone_link = response.urljoin(
            li.xpath('.//div[@class="p-name p-name-type-3"]/a/@href').
            extract_first())
        from_phone = li.xpath(
            './/span[@class="J_im_icon"]/a/text()').extract_first()
        item['phone_desc'] = phone_desc
        item['phone_price'] = phone_price
        item['phone_link'] = phone_link
        item['from_phone'] = from_phone
        yield item
    # Pagination: the current page and the page count are embedded in an
    # inline "adv_param" script block, so pull both out with regexes.
    page = int(
        re.findall('adv_param={page:"(.*?)",page_count:".*?"',
                   response.body.decode())[0])
    count_page = int(
        re.findall('adv_param={page:".*?",page_count:"(.*?)"',
                   response.body.decode())[0])
    if count_page > page:
        page = page + 1
        next_url = self.start_urls[0] + f'&page={page}'
        print(next_url)
        yield scrapy.Request(url=next_url, callback=self.parse)

def parse(self, response):
    products = response.xpath('//*[@id="plist"]/ul/li[@class="gl-item"]')
    # soup = BeautifulSoup(response.body, "lxml")
    for p in products:
        item = JdItem()
        # imlist = soup.find_all(name="img", attrs={"width": "220", "height": "220"})
        # for im in imlist:
        #     if 'src' in im.attrs:
        #         imurl = "https:" + im.attrs['src']
        #     else:
        #         imurl = "https:" + im.attrs['data-lazy-img']
        # Check for a src attribute first to decide how to get the image URL;
        # lazily loaded images keep it in data-lazy-img instead.
        if p.xpath(".//div[@class='p-img']/a/img/@src").extract():
            item['image'] = ''.join(
                ["https:"] +
                p.xpath(".//div[@class='p-img']/a/img/@src").extract())
        else:
            item['image'] = ''.join(["https:"] + p.xpath(
                ".//div[@class='p-img']/a/img/@data-lazy-img").extract())
        item['price'] = ''.join(
            p.xpath(
                ".//div[@class='p-price']/strong[@class='J_price']//text()"
            ).extract()).strip()
        item['title'] = ''.join(
            p.xpath(".//div[@class='p-name']/a/em/text()").extract()).strip()
        yield item

def parse_book_list(self, response):
    temp = response.meta['temp']
    book_list = response.xpath('//*[@id="J_goodsList"]/ul/li/div')
    for book in book_list:
        item = JdItem()
        item['big_category'] = temp['big_category']
        item['big_category_link'] = temp['big_category_link']
        item['small_category'] = temp['small_category']
        item['small_category_link'] = temp['small_category_link']
        item['bookname'] = book.xpath(
            './div[3]/a/em/text()|./div/div[2]/div[2]/div[3]/a/em/text()'
        ).extract_first().strip()
        item['author'] = book.xpath(
            './div[4]/span[1]/span/a/text()|./div/div[2]/div[2]/div[4]/span[1]/span[1]/a/text()'
        ).extract_first().strip()
        item['link'] = book.xpath(
            './div[1]/a/@href|./div/div[2]/div[2]/div[1]/a/@href'
        ).extract_first()
        # Get the book's SKU id
        skuid = book.xpath('.//@data-sku').extract_first()
        # skuid = book.xpath('./@data-sku').extract_first()
        # print("skuid:", skuid)
        # Build the price-lookup URL from the SKU id
        pri_url = 'https://p.3.cn/prices/mgets?skuIds=J_' + skuid
        yield scrapy.Request(url=pri_url,
                             callback=self.parse_price,
                             meta={'meta_1': item})

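# The spider above defers the price to a parse_price callback that is not
# included in this snippet. Below is a minimal sketch of such a callback,
# assuming the p.3.cn mgets endpoint returns a JSON array like
# [{"id": "J_...", "p": "59.00", "op": "89.00"}] ("p" is the current price,
# which is what the other spiders in this file read). The callback body is an
# illustration, not the original author's code; it needs `import json`.
def parse_price(self, response):
    # Recover the item carried through the request meta
    item = response.meta['meta_1']
    price_data = json.loads(response.body.decode())
    # "p" holds the current selling price as a string
    item['price'] = price_data[0]['p']
    yield item
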
def next_parse(self, response):
    item = JdItem()
    try:
        item["url"] = response.url
        item["name"] = response.xpath(
            '//div[@class="p-info lh"]/div[@class="p-name"]/text()').extract()
        item["store"] = response.xpath(
            '//div[@class="name"]/a/text()').extract()
        pat = "//item.jd.com/(.*?).html"
        shop_id = re.compile(pat).findall(item["url"])[0]
        response1 = requests.get(
            "https://p.3.cn/prices/mgets?callback=jQuery7879290&type=1&area=1_72_2799_0&pdtk=&pduid=719435848&pdpin=&pin=null&pdbp=0&skuIds=J_{}%2CJ_19659646005%2CJ_42646006588%2CJ_4741808%2CJ_33239063849%2CJ_33341525798%2CJ_3494451%2CJ_797802%2CJ_37652171093&ext=11100000&source=item-pc"
            .format(shop_id))
        data1 = response1.text
        # Strip the JSONP wrapper: keep everything between the outermost brackets
        price = data1[data1.index("["):data1.rindex("]") + 1]
        p = json.loads(price)
        item["price"] = p[0]["p"]
        response2 = requests.get(
            "https://club.jd.com/comment/productCommentSummaries.action?referenceIds={}&callback=jQuery2538049&_=1559982177443"
            .format(shop_id))
        data2 = response2.text
        comment = data2[data2.find("["):data2.rfind("]") + 1]
        c = json.loads(comment)
        item["comment"] = c[0]["CommentCountStr"]
        item["good_comment"] = c[0]["GoodRateShow"]
        yield item
    except Exception as e:
        print(e)

def parse_news(self, response):
    t = JdItem()
    price = response.xpath(
        './/span[@class="p-price"]/span[2]/text()').extract_first()
    t['price'] = price
    info = response.xpath(
        './/div[@class="sku-name"]/text()').extract_first()
    t['info'] = info.strip()
    time = datetime.datetime.now()
    t['time'] = time
    try:
        if round(float(price)) < round(float(self.wantprice)):
            emailSenderClient = emailSender()
            toSendEmailLst = [self.email]
            startTime = datetime.datetime.now()
            subject = "Price alert: " + info.strip()
            body = "Detail: the price has dropped below your target price"
            # Send the alert email
            emailSenderClient.sendEmail(toSendEmailLst, subject, body)
    except Exception:
        pass
    yield t

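# The emailSender helper used above is not defined in this snippet. Below is a
# minimal sketch of what such a class could look like, built on the standard
# smtplib/email modules; the SMTP host, port, account, and password are
# placeholder assumptions, not values from the original project.
import smtplib
from email.mime.text import MIMEText


class emailSender:
    def __init__(self, host='smtp.example.com', port=465,
                 user='user@example.com', password='app-password'):
        # Placeholder credentials; the real project would load its own
        self.host, self.port = host, port
        self.user, self.password = user, password

    def sendEmail(self, to_list, subject, body):
        msg = MIMEText(body, 'plain', 'utf-8')
        msg['Subject'] = subject
        msg['From'] = self.user
        msg['To'] = ','.join(to_list)
        # SMTP over SSL; swap for SMTP + starttls() if the server requires it
        with smtplib.SMTP_SSL(self.host, self.port) as server:
            server.login(self.user, self.password)
            server.sendmail(self.user, to_list, msg.as_string())
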
def parse(self, response):
    li_list = response.xpath('//div[@id="J_goodsList"]/ul/li')
    for li in li_list:
        try:
            id = li.xpath('./div/div[6]/a/@data-sku').extract_first()
            title = li.xpath('./div/div[4]/a/em/text()').extract()
            link = li.xpath('./div/div[1]/a/@href').extract_first()
            price = li.xpath('./div/div[3]/strong/i/text()').extract_first()
            # comments = li.xpath('./div/div[5]/strong/a//text()').extract_first()
            shop_name = li.xpath('./div/div[7]/span/a/text()').extract_first()
            item = JdItem()
            item["id"] = id
            item["title"] = title
            item["link"] = link
            item["price"] = price
            # item["comments"] = comments
            item["shop_name"] = shop_name
        except Exception:
            raise Exception("Parse error!!!")
        yield scrapy.Request(url=self.comments_urls + str(id),
                             callback=self.getDetailpage,
                             meta={"item": item})

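# getDetailpage is referenced above but not shown. A minimal sketch follows,
# assuming self.comments_urls points at JD's productCommentSummaries endpoint
# (as other spiders in this file use it) and that the response body is a
# JSON/JSONP payload whose CommentsCount array carries the review statistics.
# The field names are an assumption based on that endpoint's usage elsewhere
# in this file; it needs `import json`.
def getDetailpage(self, response):
    item = response.meta["item"]
    body = response.text
    # Strip any JSONP wrapper before decoding
    data = json.loads(body[body.find("{"):body.rfind("}") + 1])
    item["comments"] = data["CommentsCount"][0]["CommentCountStr"]
    yield item
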
def parse(self, response, **kwargs):
    data = self.get_data()
    for i in data:
        big_cate = i['categoryName']
        s1 = int(i['fatherCategoryId'])
        s2 = int(i['categoryId'])
        big_cate_link = f'https://channel.jd.com/{s1}-{s2}.html'
        # print(big_cate, big_cate_link)
        small_list = i['sonList']
        for small in small_list:
            item = JdItem()
            small_cate = small['categoryName']
            s3 = int(small['categoryId'])
            s4 = int(small['fatherCategoryId'])
            small_cate_link = f'https://list.jd.com/list.html?cat={s1},{s4},{s3}'
            # print(small_cate, small_cate_link)
            item['big_cate'] = big_cate
            item['big_cate_link'] = big_cate_link
            item['small_cate'] = small_cate
            item['small_cate_link'] = small_cate_link
            yield scrapy.Request(url=item['small_cate_link'],
                                 meta={'item': item},
                                 callback=self.parse_book_list)

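# get_data is not defined in this snippet. It evidently returns the parsed
# category JSON: a list of dicts with categoryName, categoryId,
# fatherCategoryId, and sonList keys. A minimal sketch, assuming the JSON was
# saved to a local file beforehand; the filename is a placeholder, not the
# original author's data source.
def get_data(self):
    with open('jd_categories.json', encoding='utf-8') as f:
        return json.load(f)
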
def parse_book_list(self, response):
    temp = response.meta['py21']
    book_list = response.xpath('//*[@id="J_goodsList"]/ul/li/div')
    data = ""
    for book in book_list:
        item = JdItem()
        item['big_category'] = temp["big_category"]
        item['big_category_link'] = temp["big_category_link"]
        item['small_category'] = temp["small_category"]
        item['small_category_link'] = temp["small_category_link"]
        item['bookname'] = book.xpath('./div[3]/a/em/text()').extract_first()
        item['author'] = book.xpath(
            './div[4]/span[1]/a/text()').extract_first()
        item['link'] = book.xpath('./div[1]/a/@href').extract_first()
        item['price'] = book.xpath('./div[2]/strong/i/text()').extract_first()
        yield item
        next_url = book.xpath(
            './/i[@class="promo-words"]/@id').extract_first().split('_')[-1]
        data += next_url + ','
    cat = response.url.split('=')[-1].replace(",", "%2c")
    yield scrapy.Request(
        url=
        f'https://list.jd.com/listNew.php?cat={cat}&page=2&s=27&scrolling=y&tpl=2_M&isList=1&show_items={data[:-1]}',
        callback=self.parse_book_list_one,
        meta={"py21": temp},
        headers={
            "referer": "https://list.jd.com/list.html?cat=1713,3258,3297"
        })

def text(self, response):
    # Article-body level
    item = JdItem()
    print("Running the article-body level yield")
    a = 0
    try:
        nevelone_title = response.xpath(
            '//td[@class="bav_border_top"][1]/a[3]//text()').extract_first()
        text_title = response.xpath(
            '//div[@id="cps_title"]//text()').extract_first()
        text_text = response.xpath('//div[@id="cp_content"]//text()').extract()
        item['title'] = nevelone_title
        item['url'] = response.meta['url']
        item['author'] = response.meta['author']
        item['time'] = response.meta['time']
        item['category'] = response.meta['category']
        item['text_title'] = text_title
        item['text_text_all'] = item['title'] + ''.join(text_text).replace(
            '\u3000\u3000', '').strip('\r\n')
        print(item['title'])
        a += 1
        print(a)
        yield item
    except Exception:
        pass

class Jdssspider(scrapy.Spider):
    name = 'jdsn'
    allowed_domains = ['jd.com']
    page = 3
    s = 56
    key = 'swith'
    nopass = False
    number = 2
    item = JdItem()
    start_urls = [
        f'https://search.jd.com/Search?keyword={key}&wq={key}&page={page}&s={s}'
    ]

    def parse(self, response):
        list_id = []
        pieces = 0
        print(response)
        print('—————————————————— start crawling ——————————————————')
        print('—————————————————— start output ——————————————————')
        goods_list = response.xpath('//div[@id="J_goodsList"]/ul/li')
        for i in goods_list:
            id = i.xpath('./@data-sku').extract_first()
            url = 'https:' + i.xpath(
                './/div[@class="p-name p-name-type-2"]/a/@href').extract_first()
            title = i.xpath(
                './/div[@class="p-name p-name-type-2"]/a/em/text()').extract_first()
            shop = i.xpath('.//div[@class="p-shop"]/span/a/text()').extract_first()
            price = i.xpath('.//div[@class="p-price"]/strong/i/text()').extract_first()
            print(title, price)
            list_id.append(id)
            self.item['url'] = url
            self.item['title'] = title
            self.item['shop'] = shop
            self.item['price'] = price
            yield self.item
            pieces += 1
            print(f'Currently at item {pieces}')
        urls = [
            f"https://search.jd.com/Search?keyword={self.key}&wq={self.key}&page={self.page}&s={self.s}",
            f'https://search.jd.com/s_new.php?keyword={self.key}&page={self.page - 1}&show_items={",".join(list_id)}'
        ]
        # The lazy-load endpoint checks the Referer, which must be the search page
        headers = {"Referer": urls[0]}
        if self.nopass:
            self.nopass = False
            url = urls[0]
            headers = None
        else:
            self.page += 2
            self.s += 60
            self.nopass = True
            url = urls[1]
        print(f'Currently on page {self.number}')
        self.number += 1
        if self.number < 20 and pieces == 30:
            yield scrapy.Request(url=url,
                                 callback=self.parse,
                                 headers=headers,
                                 dont_filter=True)
        else:
            return

def parse(self, response):
    wines = response.xpath(
        '//ul[@class="gl-warp clearfix"]/li/div[@class="gl-i-wrap"]')
    for wine in wines:
        item = JdItem()
        item['name'] = wine.xpath(
            './div[@class="p-name p-name-type-2"]/a[@target="_blank"]//em/text()'
        ).extract_first()
        print(item['name'].strip())
        yield item

def parse(self, response):
    item = JdItem()
    url = response.xpath("//div[@class='p-img']/a/@href").extract()
    # print(url)
    for this_url in url:
        if this_url.startswith("https", 0, 5):
            yield Request(this_url, callback=self.next_parse)
        else:
            yield Request("https:" + this_url, callback=self.next_parse)
    # JD search results use odd page numbers for full pages
    for i in range(3, 200, 2):
        next_url = "https://search.jd.com/Search?keyword=%E7%94%B5%E8%84%91&enc=utf-8&page={}".format(i)
        yield Request(next_url, callback=self.parse)

def parse_book_list(self, response):
    temp = response.meta['meta_1']
    # print('----', temp)
    # Get the list of book nodes
    book_list = response.xpath('//*[@id="plist"]/ul/li/div')
    # print(len(book_list))
    # Iterate over the books
    for book in book_list:
        item = JdItem()
        # Extract the book information
        item['big_cate'] = temp['big_cate']
        item['big_cate_link'] = temp['big_cate_link']
        item['small_cate'] = temp['small_cate']
        item['small_cate_link'] = temp['small_cate_link']
        item['book_name'] = book.xpath('./div[3]/a/em/text()').extract_first()
        if item['book_name'] is not None:
            item['book_name'] = item['book_name'].strip()
        item['cover_link'] = book.xpath(
            './div[1]/a/img/@src|./div[1]/a/img/@data-lazy-img'
        ).extract_first()
        item['detail_url'] = book.xpath('./div[1]/a/@href').extract_first()
        if item['detail_url'] is not None:
            item['detail_url'] = 'https:' + item['detail_url']
        item['authors'] = book.xpath(
            './div[4]/span[1]/span/a/text()').extract()
        item['publisher'] = book.xpath(
            './div[4]/span[2]/a/text()').extract_first()
        item['pub_time'] = book.xpath('./div[4]/span[3]/text()').extract_first()
        if item['pub_time'] is not None:
            item['pub_time'] = item['pub_time'].strip()
        item['sku'] = book.xpath('./@data-sku').extract_first()
        # print(item)
        # Issue the price request
        if item['sku'] is not None:
            url = 'https://p.3.cn/prices/mgets?skuIds=J_' + item['sku']
            yield scrapy.Request(url,
                                 callback=self.parse_price,
                                 meta={'meta_2': item})

def parse_book_list(self, response):
    temp = response.meta['meta_1']
    # print(temp['big_category'])
    # Get all book nodes
    book_list = response.xpath('//*[@id="plist"]/ul/li/div')
    # print(len(book_list))
    # Iterate over the book node list
    for book in book_list:
        # Build the item instance
        item = JdItem()
        # Extract the data
        item['big_category'] = temp['big_category']
        item['big_category_link'] = temp['big_category_link']
        item['small_category'] = temp['small_category']
        item['small_category_link'] = temp['small_category_link']
        item['name'] = book.xpath('./div[3]/a/em/text()').extract_first()
        try:
            item['cover_link'] = 'https:' + book.xpath(
                './div[1]/a/img/@src').extract_first()
        except TypeError:
            item['cover_link'] = None
        try:
            item['detail_url'] = 'https:' + book.xpath(
                './div[1]/a/@href').extract_first()
        except TypeError:
            item['detail_url'] = None
        item['author'] = book.xpath(
            './div[4]/span[1]/span/a/text()').extract_first()
        # The path must be relative to the book node, not the document root
        item['publisher'] = book.xpath(
            './div[4]/span[2]/a/text()').extract_first()
        item['pub_date'] = book.xpath('./div[4]/span[3]/text()').extract_first()
        # item['price'] = book.xpath('./div[4]/span[3]/text()').extract_first()
        # Build the price request
        skuid = book.xpath('./@data-sku').extract_first()
        if skuid is not None:
            url = 'https://p.3.cn/prices/mgets?skuIds=J_' + skuid
            yield scrapy.Request(url,
                                 callback=self.parse_price,
                                 meta={'meta_2': item})

def parse(self, response):
    temp_list = response.xpath("//*[@id='J_goodsList']/ul/li")
    for temp in temp_list:
        item = JdItem()
        # Join the text nodes instead of string-mangling the list repr
        item['name'] = ''.join(
            temp.xpath('div/div[3]/a/em/text()').extract())
        item['sales'] = temp.xpath('div/div[4]/strong/a/text()').extract()[0]
        item['price'] = temp.xpath('div/div[2]/strong/i/text()').extract()[0]
        yield item
    if self.i < 100:
        self.i = self.i + 2
        url = self.url1 + str(self.i) + self.url2
        yield Request(url, callback=self.parse)

def parse(self, response):
    # Top-level book categories
    dt_list = response.xpath('//*[@id="booksort"]/div[2]/dl/dt')
    # Walk from each dt to its dd sibling via following-sibling::*[1]
    for dt in dt_list:
        item = JdItem()
        item['big_name'] = dt.xpath('./a/text()').extract_first()
        # Subcategories
        em_list = dt.xpath('./following-sibling::*[1]/em')
        for em in em_list:
            item['small_name'] = em.xpath('a/text()').extract_first()
            small_link = 'http:' + em.xpath('a/@href').extract_first()
            # Kick off the second level
            yield scrapy.Request(small_link,
                                 callback=self.parse_book,
                                 meta={"book": deepcopy(item)})

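# parse_book is referenced above but not included here. A minimal sketch of
# the second-level callback, assuming the list page markup matches the other
# snippets in this file (#plist with one div per book); the exact XPaths are
# an assumption for illustration only.
def parse_book(self, response):
    item = response.meta["book"]
    for book in response.xpath('//*[@id="plist"]/ul/li/div'):
        # Copy per book so each yielded item is independent
        book_item = deepcopy(item)
        book_item['book_name'] = book.xpath(
            './div[3]/a/em/text()').extract_first()
        yield book_item
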
def parse(self, response):
    # print(response.text)
    itemList = response.xpath('//div[@class="search_prolist_item"]')
    subclass = response.xpath('//title/text()').extract()[0].split(' ')[0]
    # Only the info of the first 4 items can be retrieved
    for node in itemList[0:4]:
        item = JdItem()
        # xpath returns a list; .// means any descendant of the current node
        item['name'] = node.xpath(
            './/div[@class="search_prolist_title"]/text()'
        ).extract()[0].strip()
        item['img_url'] = node.xpath(
            './/div[@class="search_prolist_cover"]/img[@class="photo"]/@src'
        ).extract()[0]
        item['subclass'] = subclass
        item['item_id'] = node.xpath('./@skuid').extract()[0]
        # Certain subclass values cannot be retrieved; clean them in the pipeline
        yield item

def extract_product_coupon(self, response):
    product = response.meta['product']
    response_json = json.loads(str(response.body, encoding='utf8'))
    skuCoupon = response_json['skuCoupon']
    skuPromote = response_json['prom']['pickOneTag']
    skuCoupon_list = []
    skuPromote_list = []
    print(skuCoupon)
    for i in skuCoupon:
        coupon = {}
        # Spend threshold
        coupon['quota'] = i['quota']
        # Discount amount
        coupon['trueDiscount'] = i['trueDiscount']
        # Restrictions
        coupon['limit'] = i['name']
        coupon['beginTime'] = i['beginTime']
        coupon['endTime'] = i['endTime']
        skuCoupon_list.append(coupon)
    for i in skuPromote:
        prom = {}
        prom['content'] = i['content']
        prom['name'] = i['name']
        try:
            prom['adurl'] = i['adurl']
        except Exception:
            pass
        skuPromote_list.append(prom)
    product['jetso'] = {
        'product_coupon': skuCoupon_list,
        'skuPromote': skuPromote_list
    }
    item = JdItem()
    item['sku'] = product['sku']
    item['name'] = product['name']
    item['detail'] = product['detail']
    item['image'] = product['image']
    item['other_type'] = product['other_type']
    item['price'] = product['price']
    item['p_type'] = product['p_type']
    item['crawl_date'] = product['crawl_date']
    item['sku_slave_typeid'] = product['sku_slave_typeid']
    item['jetso'] = product['jetso']
    print(product)
    yield item

def extract_product_price(self, response):
    """
    Fetch the product price.
    :param response:
    :return:
    """
    product = response.meta['product']
    price_json = json.loads(str(response.body, encoding='utf8'))
    ret_json = {}
    try:
        ret_json["old_price"] = price_json[0]["op"]
    except KeyError:
        pass
    ret_json["price"] = price_json[0]["p"]
    # Check whether a JD member (Plus) price is available
    if "tpp" in price_json[0].keys():
        ret_json["vip"] = price_json[0]["tpp"]
    product['price'] = ret_json
    coupon_url = 'https://cd.jd.com/promotion/v2?skuId=%s&area=19_1609_41655_0&cat=%s' % (
        product['sku'], product['sku_slave_typeid'])
    print(coupon_url)
    if product['sku_slave_typeid']:
        yield scrapy.Request(coupon_url,
                             meta={'product': product},
                             callback=self.extract_product_coupon,
                             dont_filter=True)
    else:
        item = JdItem()
        item['sku'] = product['sku']
        item['name'] = product['name']
        item['detail'] = product['detail']
        item['image'] = product['image']
        item['other_type'] = product['other_type']
        item['price'] = product['price']
        item['p_type'] = product['p_type']
        item['crawl_date'] = product['crawl_date']
        item['sku_slave_typeid'] = product['sku_slave_typeid']
        item['product_coupon'] = ''
        yield item

def parse(self, response):
    """Parse the page returned by Selenium."""
    products = response.xpath("//*[@id='plist']/ul/li")
    for product in products:
        item = JdItem()
        item['title'] = product.xpath(
            ".//div/div[3]/a/em/text()").extract_first().strip()
        item['price'] = product.xpath(
            ".//div/div[2]/strong[1]/i/text()").extract_first()
        item['pic'] = product.xpath(
            ".//div/div[1]/a/img/@src").extract_first()
        item['comment'] = product.xpath(
            ".//div/div[4]/strong/a/text()").extract_first()
        item['store'] = product.xpath(
            ".//div/div[5]/span/a/text()").extract_first()
        yield item

def parse_book_list_one(self, response):
    temp = response.meta['py21']
    book_list = response.xpath('//li[@class="gl-item"]')
    for book in book_list:
        item = JdItem()
        item['big_category'] = temp["big_category"]
        item['big_category_link'] = temp["big_category_link"]
        item['small_category'] = temp["small_category"]
        item['small_category_link'] = temp["small_category_link"]
        item['bookname'] = book.xpath(
            './/div[@class="p-name"]/a/em/text()').extract_first()
        item['author'] = book.xpath(
            './/div[@class="p-bookdetails"]/span[1]/a/text()').extract_first()
        item['link'] = book.xpath(
            './/div[@class="p-name"]/a/@href').extract_first()
        item['price'] = book.xpath(
            './/div[@class="p-price"]/strong/i/text()').extract_first()
        yield item

def parse(self, response):
    # Pull the JSON payload out of the response body
    # (the dumps/loads round trip in the original was a no-op)
    comment = re.findall(r'{"productAttr":.*}', response.text)
    # Parse the JSON
    comm_dict = json.loads(comment[0])
    # "comments" holds the review entities
    comm_list = comm_dict['comments']
    for com in comm_list:
        item = JdItem()
        # User id
        item['id'] = com["id"]
        # Product name
        item['name'] = com['referenceName']
        # split() removes spaces and newlines; join() rebuilds the full comment
        item['comment'] = ''.join(com['content'].split())
        # User rating
        item['score'] = com['score']
        # Time
        item['time'] = com['creationTime']
        yield item

def parse_detail(self, response):
    data = response.request.meta['meta_1']
    # Get the list of book nodes
    books = response.xpath('//*[@id="plist"]/ul/li/div')
    # Iterate over the nodes
    for book in books:
        # Create the item inside the loop so each price request carries its
        # own item rather than a single shared, repeatedly mutated one
        item = JdItem()
        item['big_category'] = data['big_category']
        item['big_category_link'] = data['big_category_link']
        item['small_category'] = data['small_category']
        item['small_category_link'] = data['small_category_link']
        item['book_name'] = book.xpath('./div[3]/a/em/text()').extract_first()
        item['book_cover'] = book.xpath(
            './div[1]/a/img/@src|./div[1]/a/img/@data-lazy-img'
        ).extract_first()
        try:
            item['detail_link'] = 'https:' + book.xpath(
                './div[1]/a/@href').extract_first()
        except TypeError:
            item['detail_link'] = None
        item['author'] = book.xpath(
            './div[4]/span[1]/span/a/text()').extract_first()
        item['publisher'] = book.xpath(
            './div[4]/span[2]/a/text()').extract_first()
        item['pub_data'] = book.xpath('./div[4]/span[3]/text()').extract_first()
        # item['price'] = book.xpath('./div[2]/strong[1]/i/text()').extract_first()
        # yield item
        skuid = book.xpath('./@data-sku').extract_first()
        if skuid is not None:
            # Build the price URL
            price_url = 'https://p.3.cn/prices/mgets?skuIds=J_' + str(skuid)
            yield scrapy.Request(url=price_url,
                                 callback=self.parse_price,
                                 meta={'meta_2': item})

def parse(self, response):
    name = response.xpath(
        '//div[@id="name"]/h1/text()').extract_first().replace(' ', '')
    data = response.xpath('//div[@id="p-author"]')
    author = data.xpath('string(.)').extract_first().replace(
        '\n', '').replace(' ', '')
    data = response.xpath('//ul[@id="parameter2"]')
    detail = data.xpath('string(.)').extract_first()[1:-1].replace(' ', '')
    # JD self-operated if the details start with 出版社 (publisher)
    shop = '京东自营' if detail[:3] == '出版社' else response.xpath(
        '//ul[@id="parameter2"]/li[1]/a/text()').extract_first()
    book_id = response.xpath('//head/link/@href').extract_first().split(
        '/')[-1].split('.')[0]
    # URL of the current page
    this = 'http:' + response.xpath('//head/link/@href').extract_first()
    # Price and comment data come from JS-initiated requests whose responses
    # are JSON, so instead of scrapy's built-in machinery we fetch them
    # ourselves with requests.
    jsurl = 'http://p.3.cn/prices/get?type=1&area=1_72_2799&ext=11000000&pin=&' \
            'pdtk=FPccakpV9mj2W7jFSF%2BtATks2rbgJDLiwIUI5nkedHiAWTgr9wJVrXOToICN%2B93%2B&' \
            'pduid=1506124005948838590885&pdpin=&pdbp=0&skuid=J_{}&callback=cnp'.format(book_id)
    # pdtk is the last segment of the Cookie; ideally obtain it dynamically.
    # Strip the JSONP padding ("cnp(...)"); only the middle is JSON
    jsonstr = self.getJsonFrom(jsurl).text[5:-4]
    # Decode the JSON data
    price = json.loads(jsonstr)
    cmturl = 'http://club.jd.com/comment/productCommentSummaries.action?referenceIds={}'.format(
        book_id)
    info = self.getJsonFrom(cmturl).json()
    item = JdItem(
        book_dict={
            '书名': name,
            '作者': author,
            '店铺': shop,
            '现价': float(price['p']),
            '定价': float(price['m']),
            '平常价': float(price['op']),
            '评论数': int(info['CommentsCount'][0]['CommentCount']),
            '好评率': float(info['CommentsCount'][0]['GoodRate']),
            '详情': detail,
            '链接': this
        })
    yield item

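# getJsonFrom is used above but not defined in this snippet. A minimal sketch,
# assuming it is a thin wrapper around requests.get that sets a browser-like
# User-Agent; the header value is a placeholder assumption.
def getJsonFrom(self, url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    return requests.get(url, headers=headers)
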
def parse_booklist(self, response):
    temp = response.meta
    page = int(temp.get("page"))
    total_page = int(
        response.xpath("//span[@class='p-skip']/em/b/text()").get().strip())
    book_list = response.xpath("//div[@id='plist']/ul/li/div")
    for book in book_list:
        item = JdItem()
        item["big_category"] = temp["big_category"]
        item["big_category_link"] = temp["big_category_link"]
        item["small_category"] = temp["small_category"]
        item["small_category_link"] = temp["small_category_link"]
        item["bookname"] = book.xpath(
            ".//div[@class='p-name']/a/em/text()").get().strip()
        item["link"] = response.urljoin(
            book.xpath("./div[@class='p-name']/a/@href").get())
        item["author"] = book.xpath(
            "./div[@class='p-bookdetails']/span/span/a/text()").get()
        # Get the book's SKU id and build the price URL from it
        sku_id = book.xpath(".//@data-sku").get()
        price_url = "https://p.3.cn/prices/mgets?skuIds=J_" + sku_id
        yield scrapy.Request(url=price_url,
                             callback=self.parse_price,
                             meta={"meta_1": item})
    if page < total_page:
        page += 1
        small_category_link = temp["small_category_link"] + "&page={}".format(
            page)
        temp["page"] = page
        yield scrapy.Request(small_category_link,
                             callback=self.parse_booklist,
                             meta=temp)

def parse_book_list(self, response):
    """Parse the book list under a small book category."""
    # Get the meta passed by the parse method; temp carries the names and
    # URLs of the big/small categories obtained in parse so the item can be
    # built here.
    temp = response.meta['meta1']
    book_list = response.xpath('//*[@id="plist"]/ul/li/div')
    # Iterate over the book list
    for book in book_list:
        # Instantiate the item
        item = JdItem()
        # Book name and category info; the publisher only shows on hover
        item['name'] = book.xpath(
            './div[3]/a/em/text()').extract_first().strip()
        item['big_category'] = temp['big_category']
        item['big_category_url'] = temp['big_category_url']
        item['small_category'] = temp['small_category']
        item['small_category_url'] = temp['small_category_url']
        item['author'] = book.xpath(
            './div[@class="p-bookdetails"]/span[@class="p-bi-name"]/span[@class="author_type_1"]/a/text()'
        ).extract_first()
        item['publisher'] = book.xpath(
            './div[@class="p-bookdetails"]/span[2]/a/text()').extract_first()
        item['pub_date'] = book.xpath(
            './div[@class="p-bookdetails"]/span[3]/text()'
        ).extract_first().strip()
        try:
            item['cover_url'] = 'https:' + book.xpath(
                './div[1]/a/img/@src').extract_first()
        except TypeError:
            item['cover_url'] = None
        try:
            item['detail_url'] = 'https:' + book.xpath(
                './div[3]/a/@href').extract_first()
        except TypeError:
            item['detail_url'] = None
        # The price is not stored in the HTML; the page requests it via jQuery
        # from a p.3.cn endpoint that takes skuIds (found in the HTML) plus a
        # pduid, e.g. for a single book:
        # https://p.3.cn/prices/mgets?skuIds=J_11757834&pduid=1523432585886562677791
        # (pduid is fixed; skuIds can also be a %2C-separated batch).
        skuid = book.xpath('./@data-sku').extract_first()
        pduid = '&pduid=1523432585886562677791'
        # print(item)
        # Send a second request to get the price information
        if skuid is not None:
            # If no price comes back, the cross-origin price domain may have changed
            url = 'https://p.3.cn/prices/mgets?skuIds=J_' + skuid + pduid
            yield scrapy.Request(url=url,
                                 callback=self.parse_price,
                                 meta={'meta2': item})