def parse(self, response):
    goods = response.css('ul li.gl-item')
    # self.logger.debug(goods.extract())
    for good in goods:
        # Create a fresh item per product; reusing one instance across
        # iterations would make every yielded item share the same data.
        item = JingdongItem()
        item['keyword'] = response.meta['keyword']
        item['id'] = good.xpath('@data-sku').extract_first()
        item['price'] = good.xpath(
            'div[@class="gl-i-wrap"]/div[@class="p-price"]/strong/i/text()'
        ).extract_first()
        item['title'] = good.xpath(
            'div[@class="gl-i-wrap"]/div[@class="p-name p-name-type-2"]/a/em/text()'
        ).extract_first()
        item['promo_words'] = good.xpath(
            'div[@class="gl-i-wrap"]/div[@class="p-name p-name-type-2"]/a/i/text()'
        ).extract_first()
        item['href'] = good.xpath(
            'div[@class="gl-i-wrap"]/div[@class="p-img"]/a/@href'
        ).extract_first()
        item['comment_num'] = good.xpath(
            'div[@class="gl-i-wrap"]/div[@class="p-commit"]/strong/a/text()'
        ).extract_first()
        item['shop'] = good.xpath(
            'div[@class="gl-i-wrap"]/div[@class="p-shop"]/span/a/text()'
        ).extract_first()
        item['img'] = good.xpath(
            'div[@class="gl-i-wrap"]/div[@class="p-img"]/a/img/@source-data-lazy-img'
        ).extract_first()
        yield item
def parse(self, response):
    # print(response.text)  # debug
    good_id = re.findall(r'/(\d+)\.html', response.url)[0]
    print('Detail page id:', good_id)
    # Product name; the first text node is sometimes empty, so take the last.
    filename = response.xpath(
        '//div[contains(@class,"itemInfo-wrap")]/div[contains(@class,"sku-name")]/text()'
    ).extract()[-1].strip()
    # Strip characters that are illegal in file names.
    filename = (filename.replace('/', '-').replace(':', '')
                .replace('?', '').replace('|', '_'))
    intruduce = response.xpath(
        '//div[@id="detail"]//div[contains(@class,"p-parameter")]/ul[contains(@class,"parameter2")]/li/text()'
    ).extract()
    intruduce = self.deal_intruduce(intruduce)
    pic_ids = response.xpath(
        '//div[@id="choose-attr-1"]/div[contains(@class,"dd")]/div/@data-sku'
    ).extract()
    pic_names = response.xpath(
        '//div[@id="choose-attr-1"]/div[contains(@class,"dd")]/div/a/img/@alt'
    ).extract()
    if pic_ids:
        # The product has several variants; crawl each variant's detail page.
        for n, value in enumerate(pic_ids):
            url = "https://item.jd.com/" + pic_ids[n] + ".html"
            print(pic_ids[n], pic_names[n])
            yield scrapy.Request(url,
                                 callback=self.detailpage,
                                 meta={
                                     'filename': filename,
                                     'intruduce': intruduce,
                                     'name': pic_names[n],
                                     'good_id': pic_ids[n]
                                 },
                                 dont_filter=True)
    else:
        item = JingdongItem()
        item['filename'] = filename
        item['intruduce'] = intruduce  # already cleaned above
        img_urls1 = response.xpath(
            '//div[@id="spec-list"]/ul/li/img/@src').extract()
        item['img_urls'] = self.deal_img(img_urls1)  # all images of this style
        item['good_id'] = good_id
        item['img_name'] = item['filename'][-10:]
        yield item
    # Comment section (JSON API).
    url = ("https://club.jd.com/discussion/getProductPageImageCommentList.action"
           "?productId=" + good_id + "&isShadowSku=0&page=1&pageSize=10&_="
           + str(time.time() * 1000)[:-4])
    yield scrapy.Request(url,
                         callback=self.pinglun,
                         meta={
                             'filename': filename,
                             'good_id': good_id,
                         },
                         dont_filter=True)
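# deal_intruduce and deal_img are called above but not defined in this section.
# The two methods below are minimal sketches of what they appear to do,
# assuming deal_intruduce joins the parameter lines into one cleaned string
# and deal_img turns JD's protocol-relative thumbnail URLs into absolute,
# full-size URLs; the real implementations may differ.
def deal_intruduce(self, intruduce):
    # Strip whitespace from each parameter line and join them with ';'.
    return ';'.join(line.strip() for line in intruduce if line.strip())

def deal_img(self, img_urls):
    # Assumption: thumbnails live under an /n5/ path segment and the large
    # versions under /n1/, as on img*.360buyimg.com.
    return ['https:' + url.replace('/n5/', '/n1/') for url in img_urls]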
def parse(self, response):
    item = JingdongItem()
    ids = response.xpath(
        '//div[@class="gl-i-wrap j-sku-item"]/@data-sku').extract()
    item['name'] = response.xpath(
        '//a[@target="_blank"][@title=""]/em/text()').extract()
    item['shopname'] = response.xpath(
        '//div[@class="p-shop"]/@data-shop_name').extract()
    # Exploratory price/comment fetching, kept commented out:
    '''for item_id in ids:
        url1 = "https://p.3.cn/prices/mgets?callback=jQuery6324226&type=1&area=1_72_4137_0&skuIds=J_" + item_id
        url2 = "http://club.jd.com/comment/productCommentSummaries.action?my=pinglun2&referenceIds=" + item_id
        pat1 = '"p":"(.*?)"'
        pat2 = '"CommentCount":(.*?),"'
        pat3 = '"GoodRate":(.*?),"'
        price = re.compile(pat1).findall(urllib.request.urlopen(url1).read().decode("GBK", "ignore"))
        comment = re.compile(pat2).findall(urllib.request.urlopen(url2).read().decode("GBK", "ignore"))
        rate = re.compile(pat3).findall(urllib.request.urlopen(url2).read().decode("GBK", "ignore"))
        url = "//item.jd.com/" + item_id + ".html"
        price_list = []
        comment_list = []
        rate_list = []
        url_list = []
        price_list.append(price)
        comment_list.append(comment)
        rate_list.append(rate)
        url_list.append(url)
        print(price_list)'''
    print(item)
    return item
def next(self, response):
    item = JingdongItem()
    item["title"] = response.xpath(
        '//div[@class="sku-name"]/text()').extract()[0].strip()
    item["link"] = response.url
    item['shop'] = response.xpath(
        '//div[@class="name"]/a/text()').extract()[0].strip()
    item['shopLink'] = 'https:' + response.xpath(
        '//div[@class="name"]/a/@href').extract()[0]
    item['compositeScore'] = response.xpath(
        '//em[@class="evaluate-grade"]/span/a/text()').extract()[0]
    # The sku id is the file part of the product url: item.jd.com/<sku>.html
    tdata = response.url.split('/')
    skuids = tdata[3][:-5]
    purl = 'https://p.3.cn/prices/mgets?skuIds=J_' + skuids
    pricedata = urllib.request.urlopen(purl).read()
    jdata = json.loads(pricedata)
    item["price"] = jdata[0]["p"]
    commenturl = ("http://club.jd.com/comment/productPageComments.action"
                  "?productId={}&score=0&sortType=5&page=0&pageSize=10").format(skuids)
    commentdata = urllib.request.urlopen(commenturl).read().decode("GBK", "ignore")
    tempdata = re.findall('("content":".*?),"isTop"', commentdata)
    item['commentdata'] = "".join(tempdata)
    print(item["title"])
    print(item["link"])
    print(item["shop"])
    print(item["shopLink"])
    print(item["compositeScore"])
    print(item["price"])
    print(item["commentdata"])
    yield item
def next_parse(self, response):
    all_goods = response.xpath('/html/body/li')
    for one_good in all_goods:
        item = JingdongItem()
        try:
            data = one_good.xpath('div/div/a/em')
            item['title'] = data.xpath('string(.)').extract()[0]  # all text inside the tag
            item['comment_count'] = one_good.xpath(
                'div/div[@class="p-commit"]/strong/a/text()').extract()[0]  # comment count
            item['goods_url'] = 'http:' + one_good.xpath(
                'div/div[4]/a/@href').extract()[0]  # product link
            item['shops_id'] = one_good.xpath(
                'div/div[@class="p-shop"]/@data-shopid').extract()[0]  # shop ID
            item['shop_url'] = self.shop_url.format(shop_id=item['shops_id'])
            goods_id = one_good.xpath(
                'div/div[2]/div/ul/li[1]/a/img/@data-sku').extract()[0]
            if goods_id:
                item['goods_id'] = goods_id
            price = one_good.xpath('div/div[3]/strong/i/text()').extract()  # price
            if price:
                # Some items have zero comments and no price in the page source;
                # they seem to be temporary front-page promotions (three or four
                # per page), so items without a price are skipped.
                item['price'] = price[0]
                yield item
                # print(item)
        except Exception as e:
            pass
def parse(self, response):
    pids = []
    for li in response.xpath('//*[@id="J_goodsList"]/ul/li/div'):
        item = JingdongItem()
        price = li.xpath('div[3]/strong/i/text()').extract()
        store = li.xpath('div[7]/span/a/text()').extract()
        url = li.xpath('div[@class="p-name p-name-type-2"]/a/@href').extract()
        pid = li.xpath('@data-pid').extract()
        pids.append(''.join(pid))
        item['pid'] = ''.join(pid)
        # item['title'] = ''.join(title)
        item['price'] = ''.join(price)
        item['store'] = ''.join(store)
        item['url'] = ''.join(url)
        if item['url'].startswith('//'):
            item['url'] = 'https:' + item['url']
        elif not item['url'].startswith('https:'):
            item['info'] = None
            yield item
            continue
        yield scrapy.Request(self.comment_url % ''.join(pid),
                             callback=self.comment_parse,
                             meta={"item": item})
    headers = {'referer': response.url}
    yield scrapy.Request(self.next_url % (self.key_word, self.page, ','.join(pids)),
                         callback=self.next_parse,
                         headers=headers)
def getitemInfo(self, response):
    item = JingdongItem()
    init_priceUrl = 'https://p.3.cn/prices/mgets?skuIds=J_{}'
    itemid = re.findall(r'(\d+)', response.url)[0]
    try:
        item['title'] = "".join(
            response.xpath('//div[@class="sku-name"]/text()').extract()
        ).replace(' ', '').replace('\r', '').replace('\n', '')
        item['goods_url'] = response.url
        item['goods_id'] = itemid
        item['itemDetail'] = ";".join(
            response.xpath(
                '//div[@class="p-parameter"]/ul[@class="parameter2 p-parameter-list"]/li/text()'
            ).extract())
        # yield item
        yield scrapy.Request(url=init_priceUrl.format(itemid),
                             meta={'item': item},
                             callback=self.getitemPrice,
                             dont_filter=True)
    except Exception as e:
        print('No base data for this item')
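# getitemPrice is the callback requested above but not shown in this section.
# A minimal sketch, assuming the p.3.cn endpoint returns a JSON array like
# [{"id": "J_123", "p": "99.00", ...}], that json is imported at module level,
# and that JingdongItem has a 'price' field; the real spider may differ.
def getitemPrice(self, response):
    item = response.meta['item']
    data = json.loads(response.text)
    item['price'] = data[0]['p']  # 'p' holds the current price in this API
    yield item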
def next_parse(self, response):
    for li in response.xpath('//li[@class="gl-item"]'):
        item = JingdongItem()
        # title = li.xpath('div/div/a/em/text()').extract()
        price = li.xpath('div/div/strong/i/text()').extract()
        store = li.xpath('div/div/span/a/text()').extract()
        url = li.xpath('div/div[@class="p-name p-name-type-2"]/a/@href').extract()
        pid = li.xpath('@data-pid').extract()
        item['pid'] = ''.join(pid)
        item['price'] = ''.join(price)
        item['store'] = ''.join(store)
        item['url'] = ''.join(url)
        if item['url'].startswith('//'):
            item['url'] = 'https:' + item['url']
        elif not item['url'].startswith('https:'):
            item['info'] = None
            yield item
            continue
        yield scrapy.Request(self.comment_url % ''.join(pid),
                             callback=self.comment_parse,
                             meta={"item": item})
    if self.page < 200:
        self.page += 2
        yield scrapy.Request(self.url % (self.key_word, self.key_word, self.page),
                             callback=self.parse)
def parse(self, response):
    all_goods = response.xpath('//div[@id="J_goodsList"]/ul/li')
    for one_good in all_goods:
        item = JingdongItem()
        try:
            data = one_good.xpath('div/div/a/em')
            item['title'] = data.xpath('string(.)').extract()[0]  # all text inside the tag
            item['comment_count'] = one_good.xpath(
                'div/div[@class="p-commit"]/strong/a/text()').extract()[0]  # comment count
            item['goods_url'] = 'http:' + one_good.xpath(
                'div/div[4]/a/@href').extract()[0]  # product link
            item['shop_url'] = 'http:' + one_good.xpath(
                'div/div[7]/span/a/@href').extract()[0]  # shop link
            item['shops_id'] = self.find_shop_id(item['shop_url'])  # shop ID
            goods_id = one_good.xpath(
                'div/div[2]/div/ul/li[1]/a/img/@data-sku').extract()[0]
            if goods_id:
                item['goods_id'] = goods_id
            price = one_good.xpath('div/div[3]/strong/i/text()').extract()  # price
            if price:
                # Some items have zero comments and no price in the page source;
                # they seem to be temporary front-page promotions (three or four
                # per page), so items without a price are skipped.
                item['price'] = float(price[0])
                yield item
        except Exception as e:
            pass
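# find_shop_id is called above but not defined in this section. A plausible
# sketch under the assumption that shop URLs look like
# //mall.jd.com/index-1000001782.html, with the numeric shop id before .html;
# the real URL shape may differ.
def find_shop_id(self, shop_url):
    match = re.search(r'-(\d+)\.html', shop_url)
    return match.group(1) if match else None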
def parse(self, response):
    listdata = response.body.decode('utf-8', 'ignore')
    # Channel breadcrumb (two levels); '缺省' ("missing") is the fallback.
    pd = response.xpath('//span[@class="curr"]/text()').extract()
    if len(pd) == 0:
        pd = ['缺省', '缺省']
    if len(pd) == 1:
        pd = [pd[0], '缺省']
    pd1 = pd[0]
    pd2 = pd[1]
    # Book titles
    bookname = response.xpath('//div[@class="p-name"]/a/em/text()').extract()
    # All sku ids on the page (needed for the price and comment APIs)
    allskupat = '<a data-sku="(.*?)"'
    allsku = re.compile(allskupat).findall(listdata)
    # Author info
    author = response.xpath('//span[@class="author_type_1"]/a/@title').extract()
    # Publisher info
    pub = response.xpath('//span[@class="p-bi-store"]/a/@title').extract()
    # Seller
    seller = response.xpath('//span[@class="curr-shop"]/text()').extract()
    # Process the current page
    for n in range(0, len(seller)):
        item = JingdongItem()
        name = bookname[n + 3]  # title list is offset by 3 relative to the seller list
        thissku = allsku[n]
        priceurl = ('https://p.3.cn/prices/mgets?callback=jQuery7839616&type=1&skuIds=J_'
                    + str(thissku))
        pricedata = urllib.request.urlopen(priceurl).read().decode('utf-8', 'ignore')
        pricepat = '"p":"(.*?)"'
        price = re.compile(pricepat).findall(pricedata)[0]
        commenturl = ('https://club.jd.com/comment/productCommentSummaries.action'
                      '?my=pinglun&referenceIds=' + str(thissku) + '&callback=jQuery8841347')
        commentdata = urllib.request.urlopen(commenturl).read().decode('utf-8', 'ignore')
        commentpat = '"CommentCount":(.*?),'
        commentnum = re.compile(commentpat).findall(commentdata)[0]
        thisauthor = author[n]
        thispub = pub[n]
        thisseller = seller[n]
        print(pd1, pd2, name, price, commentnum, thisauthor, thispub, thisseller)
        item['channel1'] = pd1
        item['channel2'] = pd2
        item['name'] = name  # the current title, not the whole list
        item['price'] = price  # assumes JingdongItem defines a 'price' field
        item['comment_num'] = commentnum
        item['author'] = thisauthor
        item['pub'] = thispub
        item['seller'] = thisseller
        yield item
def coupon_detail(self, response):
    next_page = response.xpath(
        '/html/body/div[5]/div/div[2]/div[2]/div/div[2]/div/a[9]//@href'
    ).extract()
    for sel in response.xpath('//*[@id="coupons-list"]'):
        name = sel.xpath('//div[1]/div[2]/div[1]/p//@title').extract()
        item = JingdongItem()
        item['name'] = name
        yield item
def parse(self, response):
    # CSS selectors (note to self: study these more)
    lilist = response.css('div#plist ul li')
    for li in lilist:
        item = JingdongItem()
        item['name'] = li.css('div.p-name a em::text').extract_first()
        item['price'] = li.css('div.p-price strong i::text').extract_first()
        item['shop'] = li.css('div.p-shop a::text').extract_first()
        yield item
def parse(self, response):
    # Collect every pid on the page; they are needed to build the ajax url.
    pid_list = []
    i = 1
    phone_list = response.xpath('//li[@data-sku]')
    # Last page number
    end_page = response.xpath("//span[@class='p-skip']//b/text()").extract()
    for item in phone_list:
        jd = JingdongItem()
        # PID
        jd['pid'] = item.xpath('./@data-pid').extract_first()
        # Product image link
        jd['image_link'] = self._get_phone_image(item)
        # Price
        jd['price'] = item.xpath(
            ".//div[@class='p-price']//i/text()").extract_first()
        # Title
        jd['title'] = item.xpath(
            ".//div[@class='p-name p-name-type-2']//em/text()").extract()
        # Comment count
        jd['comment_num'] = item.xpath(
            ".//div[@class='p-commit']/strong/a/text()").extract_first()
        # Shop name (None means the entry is an ad)
        jd['shop_name'] = item.xpath(
            ".//div[@class='p-shop']/span/a/text()").extract_first()
        # Shop link
        jd['shop_link'] = item.xpath(
            ".//div[@class='p-shop']/span/a/@href").extract_first()
        print('*' * 30, jd['shop_link'])
        # Second-hand link (a link if present, otherwise None)
        jd['second_link'] = item.xpath(
            ".//div[@class='p-commit']/a/@href").extract_first()
        # Ad flag
        jd['ad'] = item.xpath(
            './/span[@class="p-promo-flag"]/text()').extract_first()
        # Join the collected pids with commas for the ajax request.
        pid_list.append(jd['pid'])
        self.show_items = ','.join(pid_list)
        logger.info(i)
        yield jd
        i += 1
    # Once the 30 server-rendered items are collected, request the remaining
    # 30 items of the current page directly.
    if len(pid_list) == 30:
        self.search_page = self.page + 1
        yield scrapy.Request(self.next_url.format(self.search_page, self.show_items),
                             callback=self.parse_other_info,
                             meta={'end_page': end_page})
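# _get_phone_image is used above but not shown in this section. A minimal
# sketch, assuming JD lazy-loads list images through the data-lazy-img
# attribute and falls back to src; attribute names mirror the other spiders
# in this file but are still an assumption here.
def _get_phone_image(self, selector):
    link = selector.xpath(
        ".//div[@class='p-img']//img/@data-lazy-img").extract_first()
    if not link or link == 'done':
        # 'done' marks an already-loaded lazy image; use src instead.
        link = selector.xpath(".//div[@class='p-img']//img/@src").extract_first()
    return 'https:' + link if link else None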
def parse(self, response):
    li_list = response.selector.xpath('//div[@id="J_goodsList"]/ul//li')
    for l_list in li_list:
        goods = JingdongItem()
        # extract_first() may return None, which ''.join() would crash on,
        # so default to an empty string instead.
        img_src = l_list.xpath(
            './/div[@class="p-img"]/a/img/@src').extract_first() or ''
        price = l_list.xpath(
            './/div[@class="p-price"]//i/text()').extract_first() or ''
        goods['img_src'] = img_src
        goods['price'] = price
        yield goods
def parsefirstPage(self, response):
    infos = response.xpath('//li[@class="gl-item"]/div/div[@class="p-img"]/a')
    for info in infos:
        item = JingdongItem()
        url = info.xpath('@href').extract()
        goods_link = response.urljoin(url[0])
        item['link'] = goods_link  # product link
        for link in url:
            goods_url = response.urljoin(link)
            yield Request(goods_url, meta={'meta': item}, callback=self.parsegoods)
    # Turn the page
    if self.page < 200:
        self.page += 2
        yield scrapy.Request(self.url % (self.page), callback=self.parsefirstPage)
def parse_item(self, response):
    try:
        i = JingdongItem()
        this_url = response.url  # the page currently being crawled
        form = "item.jd.com/(.*?).html"  # regex for product pages
        x = re.search(form, this_url)  # only proceed on product pages
        if x:
            goods_id = re.compile(form).findall(this_url)[0]
            title = response.xpath("//div[@class='sku-name']/text()").extract()
            shop = response.xpath("//div[@class='usertopleft']/h2/text()").extract()
            shop_link = response.xpath(
                "//div[@class='usertopleft']/a[@target='_blank']/@href").extract()
            # Price API url found by packet capture
            price_url = ("https://p.3.cn/prices/mgets?callback=jQuery1888176&type=1"
                         "&area=19_1601_3633_0&pdtk=&pduid=323942232&pdpin=&pin=null"
                         "&pdbp=0&skuIds=J_" + str(goods_id)
                         + "&ext=11100000&source=item-pc")
            # Read the API response
            price_data = urllib.request.urlopen(price_url).read().decode("utf-8")
            # Price regex
            price_form = '"p":"(.*?)"'
            # Extracted price
            price = re.compile(price_form).findall(price_data)
            if len(title) and len(shop) and len(shop_link) and len(price):
                # Field names are assumed to exist on JingdongItem; the
                # original only printed these values without storing them.
                i['title'] = title[0]
                i['shop'] = shop[0]
                i['shoplink'] = shop_link[0]
                i['price'] = price[0]
                print(title[0])
                print(shop[0])
                print(shop_link[0])
                print(price[0])
                print("………………")
        return i
    except Exception as e:
        print(e)
def parse_item(self, response):
    sel = Selector(response)
    filename = response.url.split("/")[-1]
    item = JingdongItem()
    item["url"] = [response.url]
    item["name"] = sel.xpath('//*[@id="name"]/h1/text()').extract()
    # The on-page price is generated by JS, so it cannot be scraped directly:
    # item["price"] = sel.xpath('//div[2]/div[2]/strong/text()').extract()
    # See http://blog.csdn.net/lanshanlei/article/details/42741179 for the
    # price API used instead.
    productid = os.path.splitext(filename)[-2]
    priceUrl = 'http://p.3.cn/prices/mgets?skuIds=J_' + productid
    r = Request(priceUrl, callback=self.parsePrice)
    r.meta['item'] = item
    yield r
def parse(self, response):
    dt_list = response.xpath("//div[@class='mc']/dl/dt")
    for dt in dt_list:
        item = JingdongItem()
        item["b_cate"] = dt.xpath("./a/text()").extract_first()
        dd_list = dt.xpath("./following-sibling::dd[1]/em")
        for dd in dd_list:
            item["s_cate"] = dd.xpath("./a/text()").extract_first()
            item["s_href"] = dd.xpath("./a/@href").extract_first()
            if item["s_href"] is not None:
                item["s_href"] = "https:" + item["s_href"]
                yield scrapy.Request(item["s_href"],
                                     callback=self.book_detail_list,
                                     meta={"item": deepcopy(item)})
def parsebody(self, response):
    # Strip the JSONP wrapper so the payload parses as plain JSON.
    t = re.findall(r'^fetchJSON_comment98vv\d*\((.*)\);', response.text)
    json_data = json.loads(t[0])
    for comment in json_data['comments']:  # a list of dicts
        item = JingdongItem()
        try:
            item['content'] = comment['content']
            item['creationTime'] = comment['creationTime']
            item['productColor'] = comment['productColor']
            item['productSize'] = comment['productSize']
            item['userClientShow'] = comment['userClientShow']
            item['userLevelName'] = comment['userLevelName']
            yield item
        except:
            continue
def parse(self, response):
    print("Parsing data")
    count = 0
    computers = response.selector.xpath(
        ".//div[@id='J_goodsList']/ul[@class='gl-warp clearfix']/"
        "li[@class='gl-item']/div[@class='gl-i-wrap']")
    for computer in computers:
        item = JingdongItem()
        item["name"] = computer.xpath(
            ".//div[@class='p-name']/a/em/text()"
        ).extract_first().replace('\n', '').replace('\t', '')
        item['url'] = "https:" + computer.xpath(
            ".//div[@class='p-name']/a/@href").extract_first()
        item['price'] = computer.xpath(
            ".//div[@class='p-price']/strong/i/text()").extract_first()
        # item['image'] = "https:" + computer.xpath(".//div[@class='p-img']/a/img/@src").extract_first()
        item['comments'] = computer.xpath(
            ".//div[@class='p-commit']/strong/a/text()").extract_first()
        count += 1
        print(count)
        yield item
def parse_item(self, response):
    try:
        item = JingdongItem()
        thisUrl = response.url
        pat = 'item.jd.com/(.*?).html'
        x = re.search(pat, thisUrl)
        if x:
            thisid = re.compile(pat).findall(thisUrl)[0]
            title = response.xpath('//html/head/title/text()').extract()
            shop = response.xpath(
                "//*[@id='popbox']/div/div[1]/h3/a/text()").extract()
            shoplink = response.xpath(
                "//*[@id='popbox']/div/div[1]/h3/a/@href").extract()
            priceUrl = ("https://p.3.cn/prices/mgets?callback=jQuery6964855&type=1"
                        "&area=1&pdtk=&pduid=50528027&pdpin=&pin=null&pdbp=0"
                        "&ext=11000000&source=item-pc&skuIds=J_" + thisid)
            commentUrl = ("https://sclub.jd.com/comment/productPageComments.action"
                          "?callback=fetchJSON_comment98vv1463&score=0&sortType=5"
                          "&page=0&pageSize=10&isShadowSku=0&fold=1&productId=" + thisid)
            priceData = urllib.request.urlopen(priceUrl).read().decode('utf-8', 'ignore')
            commentData = urllib.request.urlopen(commentUrl).read().decode('utf-8', 'ignore')
            pricePat = '"p":"(.*?)"'
            commentPat = '"goodRateShow":(.*?),'
            price = re.compile(pricePat).findall(priceData)
            comment = re.compile(commentPat).findall(commentData)
            if len(title) and len(shop) and len(shoplink) and len(price) and len(comment):
                item['title'] = title[0]
                item['shop'] = shop[0]
                item['shoplink'] = shoplink[0]
                item['price'] = price[0]
                item['comment'] = comment[0]
        return item
    except Exception as err:
        print(err)
def crawl_info(self, response):
    '''
    Scrape the book title, price, and comment count from the comment API.
    :param response:
    :return:
    '''
    id = "12398725"  # re.findall('\d+', response.url)[0]
    header = {'Referer': response.url}
    info_url = ('https://club.jd.com/comment/skuProductPageComments.action'
                '?callback=fetchJSON_comment98vv426&productId=' + str(id)
                + '&score=0&sortType=6&page=0&pageSize=10&isShadowSku=0&fold=1')
    resp = get(info_url, headers=header).content.decode('GBK')
    print(resp)
    # Strip the JSONP wrapper so the payload parses as plain JSON.
    data = re.sub(r'^fetchJSON_comment98vv\d+\(|\);', '', resp)
    elements = json.loads(data)
    count_comment = int(elements['productCommentSummary']['commentCount'])
    print(count_comment)
    # if count_comment % 10 == 0:
    #     page = count_comment // 10
    # if count_comment % 10 != 10:
    #     page = (count_comment // 10) + 1
    for i in range(0, 20):
        info_url = ('https://club.jd.com/comment/skuProductPageComments.action'
                    '?callback=fetchJSON_comment98vv426&productId=' + str(id)
                    + '&score=0&sortType=6&page=' + str(i)
                    + '&pageSize=10&isShadowSku=0&fold=1')
        resp = get(info_url, headers=header).content.decode('GBK')
        data = re.sub(r'^fetchJSON_comment98vv\d+\(|\);', '', resp)
        elements = json.loads(data)
        for element in elements['comments']:
            # Build and yield one item per comment; returning a single item
            # would stop after the first page's first comment.
            item = JingdongItem()
            item['user_name'] = element['nickname']
            item['user_id'] = element['id']
            item['id'] = element['referenceId']
            item['good_name'] = element['referenceName']
            item['score'] = element['score']
            item['userLevelId'] = element['userLevelId']
            item['userClientShow'] = element['referenceName']
            item['isMobile'] = element['isMobile']
            # print(item['user_name'])
            yield item
def detailpage(self, response):
    item = JingdongItem()
    item['filename'] = response.meta['filename']
    item['intruduce'] = response.meta['intruduce']
    item['good_id'] = response.meta['good_id']  # id of this variant
    img_urls1 = response.xpath('//div[@id="spec-list"]/ul/li/img/@src').extract()
    item['img_urls'] = self.deal_img(img_urls1)  # all images of this variant
    # Variant name, stripped of characters that are illegal in file names.
    name = response.meta['name']
    name = name.replace('/', '_').replace(':', '').replace('|', '_')
    item['img_name'] = name
    yield item
def parse(self, response):
    node_list = response.xpath('//div[@class="mc"]/dl/dt')
    # print(node_list)
    for node in node_list:
        item = JingdongItem()
        item["b_type"] = node.xpath('./a/text()').extract_first()
        s_tpye_list = node.xpath('./following-sibling::dd[1]/em')
        # print(s_tpye_list)
        for s_tpye in s_tpye_list:
            item["s_type"] = s_tpye.xpath('./a/text()').extract_first()
            s_tpye_url = "http:" + s_tpye.xpath('./a/@href').extract_first()
            # print(s_tpye_url)
            # Pass a copy: the same item object is mutated on every loop
            # iteration, so sharing it across requests would corrupt the data
            # (requires `from copy import deepcopy`).
            yield scrapy.Request(s_tpye_url,
                                 callback=self.book_list,
                                 meta={"item": deepcopy(item)})
def parse(self, response):
    lis = response.xpath('//*[@id="plist"]/ul/li/div')
    for li in lis:
        # Create a fresh item per product rather than reusing one instance.
        item = JingdongItem()
        name = li.xpath('string(.//a/em)').extract_first().strip()
        price = li.xpath('string(./div/strong/i)').extract_first()
        discuss_num = li.xpath('string(./div[5]/strong/a)').extract_first()
        # Fall back to the lazy-load attribute when src is empty.
        img = li.xpath('string(./div[1]/a/img/@src)').extract_first()
        if not img:
            img = li.xpath('string(./div[1]/a/img/@data-lazy-img)').extract_first()
        shop_name = li.xpath('string(./div[7])').extract_first()
        item['name'] = name
        item['price'] = price
        item['discuss_num'] = discuss_num
        item['img'] = img
        item['shop_name'] = shop_name
        yield item
def next2(self, response):
    item = JingdongItem()
    # Strip JD's boilerplate suffixes from the page title.
    item['title'] = response.xpath('//head/title/text()').extract()[0].replace(
        '【图片 价格 品牌 报价】-京东', '').replace('【行情 报价 价格 评测】-京东', '')
    item['link'] = response.url
    # Price via the captured API
    ture_id = re.findall(r'https://item.jd.com/(.*?).html', item['link'])[0]
    price_url = "https://p.3.cn/prices/mgets?skuIds=J_" + str(ture_id)
    price_txt = urllib.request.urlopen(price_url).read().decode('utf-8', 'ignore')
    item['price'] = re.findall(r'"p":"(.*?)"', price_txt)[0]
    # Comment count via the captured API
    comment_url = ("https://club.jd.com/comment/productCommentSummaries.action"
                   "?referenceIds=" + str(ture_id))
    comment_txt = urllib.request.urlopen(comment_url).read().decode('utf-8', 'ignore')
    item['comment'] = re.findall(r'"CommentCount":(.*?),"', comment_txt)[0]
    return item
def get_page(self, response):
    # Parse the product page.
    # Book title (the part before the first "(")
    bookname = response.xpath("//title/text()").get().split(r"(")[0]
    # Author
    author = response.xpath("//div[@class='p-author']/a/text()").get()
    # Product id, taken from the page url
    bookid = re.findall(r"https://item.jd.com/(.*?).html", response.url)
    bookid = "".join(bookid)
    # Price, fetched from the JSON price endpoint
    price = self.get_book_price(bookid)
    # Comment count, fetched from the JSON comment endpoint
    commentcount = self.get_commentcount(bookid)
    # Publisher
    putlish = response.xpath("//div[@class='p-parameter']//li/@title").get()
    item = JingdongItem()
    item["bookname"] = bookname
    item["author"] = author
    item["price"] = price
    item["commentcount"] = commentcount
    item["putlish"] = putlish
    item["bookurl"] = response.meta["bookurl"]
    yield item
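# get_book_price and get_commentcount are called above but not defined in this
# section. Minimal sketches, assuming the same p.3.cn price API and
# productCommentSummaries API used by the other spiders in this file; the
# regexes mirror those spiders, and urllib.request/re are assumed imported.
def get_book_price(self, bookid):
    url = 'https://p.3.cn/prices/mgets?skuIds=J_' + str(bookid)
    text = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
    prices = re.findall(r'"p":"(.*?)"', text)
    return prices[0] if prices else None

def get_commentcount(self, bookid):
    url = ('https://club.jd.com/comment/productCommentSummaries.action'
           '?referenceIds=' + str(bookid))
    text = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
    counts = re.findall(r'"CommentCount":(\d+)', text)
    return counts[0] if counts else None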
def parse_comment(self, response):
    comments = json.loads(response.text)
    commentsCount = comments.get("CommentsCount")[0]
    comments_str = commentsCount.get("CommentCountStr")
    good_comments_str = commentsCount.get("GoodCountStr")
    good_comments_rate = commentsCount.get("GoodRate")
    poor_comments_str = commentsCount.get("PoorCountStr")
    poor_comments_rate = commentsCount.get("PoorRate")
    average_score = commentsCount.get("AverageScore")
    item = JingdongItem()
    p = response.meta
    item['pro_id'] = p.get("pro_id")
    item['head_img'] = p.get("head_img")
    item['pro_url'] = p.get("pro_url")
    item['pro_name'] = p.get("pro_name")
    item['shop_id'] = p.get("shop_id")
    # '自营' = "JD self-operated"; '非自营' = "third-party"
    if len(item['shop_id']) > 7:
        item['is_ziying'] = u"自营"
    else:
        item['is_ziying'] = u"非自营"
    item['shop_url'] = p.get("shop_url")
    item['category_1'] = p.get("category_1")
    item['category_2'] = p.get("category_2")
    item['category_3'] = p.get("category_3")
    item['pro_price'] = p.get("price")
    item['shop_name'] = p.get("shop_name")
    item['shop_score'] = p.get("shop_score")
    item['comments_str'] = comments_str
    item['good_comments_str'] = good_comments_str
    item['good_comments_rate'] = good_comments_rate
    item['poor_comments_str'] = poor_comments_str
    item['poor_comments_rate'] = poor_comments_rate
    item['average_score'] = average_score
    yield item
def parse_item(self, response):
    item = JingdongItem()
    item["title"] = response.xpath(
        "//div[@class='p-name']/a/em/text()").extract()
    item["url"] = response.xpath(
        "//div[@class='p-name']/a/@href").extract()
    item["p_id"] = response.xpath(
        "//div[@class='p-operate']/a[@class='p-o-btn contrast J_contrast "
        "contrast-hide']/@data-sku").extract()
    # Build batch price and comment API urls from all product ids on the page.
    priceurl = self.pricebaseurl
    commiturl = self.commitbaseurl
    for i in item["p_id"]:
        priceurl = priceurl + "J_" + i + "%2C"
        commiturl = commiturl + i + ","
    # print(priceurl)
    # print(commiturl)
    item["price"] = urllib.request.urlopen(priceurl).read().decode('utf-8', 'ignore')
    item['commit'] = urllib.request.urlopen(commiturl).read().decode('utf-8', 'ignore')
    yield item
def get_content(self, response):
    '''Collect the product's detail information.'''
    item = JingdongItem()
    # Crawl timestamp
    item['date'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
    item['url'] = response.request.url
    item['shop_name'] = self.judje_res(response.xpath(
        '//div[@class="w"]//div[@class="item"]/div[@class="name"]/a/text()').extract())
    title = ''.join(response.xpath(
        '//div[@class="w"]//div[@class="itemInfo-wrap"]/div[@class="sku-name"]/text()').extract())
    item['goods'] = title.strip()
    item['brand'] = self.judje_res(response.xpath(
        '//ul[@id="parameter-brand"]/li/a/text()').extract())
    item['price'] = self.get_price(item['url'])
    comment = self.get_comment(item['url'])
    item['comment_count'] = comment[0]
    item['good_rate'] = comment[1]
    item['poor_rate'] = comment[2]
    item['select_shop'] = self.join_list(response.xpath(
        '//div[@class="summary p-choose-wrap"]//div[@id="choose-attr-1"]/div[@class="dd"]/div/a/i/text()').extract())
    item['image'] = self.join_list(response.xpath(
        '//div[@id="spec-list"]/ul/li/img/@src').extract())
    # Parameter table (food products only); split each "key:value" row once,
    # so values that themselves contain ':' survive intact.
    pars = response.xpath('//div[@class="p-parameter"]/ul[2]/li')
    p = {}
    for par in pars:
        detail = par.xpath('./text()').get().split(':', 1)
        if len(detail) == 2:
            p[detail[0]] = detail[1]
    # '不存在' ("not present") is the default when a parameter is missing.
    item['weight'] = p.get('商品毛重', '不存在')
    item['category'] = p.get('类别', '不存在')
    item['sugar'] = p.get('是否含糖', '不存在')
    item['fat'] = p.get('脂肪含量', '不存在')
    item['addr'] = p.get('商品产地', '不存在')
    print(item['url'], item['shop_name'], item['goods'])
    yield item
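# judje_res and join_list are used above but not defined in this section.
# Minimal sketches of what they appear to do: judje_res guards against empty
# xpath results, and join_list flattens a result list into one string.
# get_price and get_comment would follow the same price/comment API patterns
# as the other spiders above.
def judje_res(self, res):
    # Return the first extracted value, or the '不存在' ("not present")
    # placeholder when nothing matched (placeholder choice is an assumption).
    return res[0].strip() if res else '不存在'

def join_list(self, res):
    # Join all extracted values into a single comma-separated string.
    return ','.join(r.strip() for r in res)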