def parse(self, response):
    """Parse one JD search-result page: yield one JdItem per product card,
    then request the next page (up to page 100).

    Cleanup: removed the dead commented-out range(2,101) pagination block
    that duplicated the live self.page counter below.
    """
    items = response.css('.gl-warp.clearfix > li.gl-item')
    for item in items:
        dic = JdItem()
        dic['name'] = item.css('.p-name a i::text').extract_first()
        # Lazy-loaded image URL lives in the source-data-lazy-img attribute.
        dic['image'] = item.css(
            '.p-img a img::attr(source-data-lazy-img)').extract_first()
        dic['price'] = item.css('.p-price i::text').extract_first()
        dic['deal'] = item.css('.p-commit strong a::text').extract_first()
        dic['shop'] = item.css('.p-shop a::text').extract_first()
        yield dic
    # Linear pagination; dont_filter because the URL pattern repeats.
    if self.page <= 100:
        self.page += 1
        yield scrapy.Request(self.url + str(self.page),
                             callback=self.parse,
                             dont_filter=True)
def parse_item(self, response):
    """Parse a JD list page and emit one price-lookup Request per product,
    carrying the partially-filled item in meta."""
    # BUG FIX: the original pattern 'page=(\d+?)' captured only the FIRST
    # digit of the page number (a lazy quantifier at the end of a pattern
    # matches minimally). Also guard re.search returning None.
    match = re.search(r'page=(\d+)', response.url)
    num = match.group(1) if match else '?'
    print("正在翻页%s" % (num))
    gl_items = response.xpath('//li[@class="gl-item"]')
    base_price_url = 'https://p.3.cn/prices/mgets?callback=jQuery%s&skuIds=J_%s&pduid=%s'
    for gl_item in gl_items:
        item = JdItem()
        # Shop name
        item['jd_shop_name'] = gl_item.xpath(
            './div/div[@class="p-shop"]/@data-shop_name').extract_first()
        # Product id (sku)
        item['product_id'] = gl_item.xpath('.//div/@data-sku').extract_first()
        # Detail page url
        item['jd_page_url'] = 'http://' + gl_item.xpath(
            './/div[@class="p-img"]/a/@href').extract_first()
        # Per-product price endpoint (random pduid mimics a fresh client)
        price_url = base_price_url % (item['product_id'], item['product_id'],
                                      random.randint(0, 1000000))
        # Image: either the eager src or the lazy data-lazy-img attribute
        jd_img_url = gl_item.xpath(
            './/img[@height="220"]/@src | .//img[@height="220"]/@data-lazy-img'
        ).extract_first()
        item['jd_img_url'] = 'https:' + jd_img_url
        yield Request(url=price_url, callback=self.parse_price,
                      meta={'item': item})
def parse(self, response):
    """Parse a JD list page and chase each product's price endpoint.

    Fixes:
    - XPath predicates used ``[class="…"]`` without ``@`` — that matches a
      child *element* named "class", i.e. nothing at all.
    - ``item['product_id']`` was read to build the price URL but was never
      assigned; it is now derived from the detail href (…/<sku>.html —
      TODO confirm the href shape against the live page).
    - Removed the unconditional debug print that fired on every item.
    """
    selector = Selector(response)
    lis = selector.xpath('//ul[@class="gl-warp clearfix"]/li')
    base_price_url = 'https://p.3.cn/prices/mgets?callback=jQuery%s&skuIds=J_%s'
    for li in lis:
        item = JdItem()
        item['name'] = li.xpath(
            'div/div[@class="p-img"]/a/@title').extract()[0]
        item['comment_num'] = li.xpath(
            'div/div[@class="p-commit"]/strong/a/text()').extract()[0]
        item['url'] = li.xpath(
            'div/div[@class="p-img"]/a/@href').extract()[0]
        item['info'] = li.xpath(
            'div/div[@class="p-name p-name-type-2"]/a/em/text()').extract()[0]
        # Sku is the filename portion of the detail link: …/<sku>.html
        item['product_id'] = item['url'].rsplit('/', 1)[-1].split('.')[0]
        # One price link per sku.
        price_url = base_price_url % (item['product_id'], item['product_id'])
        yield Request(url=price_url, callback=self.parse,
                      meta={
                          'item': item,
                          'dont_redirect': True,
                          'handle_httpstatus_list': [302]
                      })
def parse(self, response):
    """Yield one JdItem per <li> inside the #J_goodsList grid, with each
    field built by joining and trimming the matched text/attr nodes."""
    product_nodes = response.xpath(
        '//div[@id="J_searchWrap"]//div[@id="J_goodsList"]//li')
    for node in product_nodes:
        entry = JdItem()

        def joined(xp):
            # Concatenate every match and strip surrounding whitespace.
            return ''.join(node.xpath(xp).extract()).strip()

        entry['image'] = joined('.//div[@class="p-img"]//a/@href')
        entry['price'] = joined('.//div[@class="p-price"]//i/text()')
        entry['shop'] = joined('.//div[contains(@class, "shop")]//a/text()')
        entry['title'] = joined('.//div[contains(@class,"p-name")]//em//text()')
        entry['deal'] = joined('.//div[@class="p-commit"]//text()')
        yield entry
def parse(self, response):
    """Extract name / price / remark / publisher for each book on the page.

    Fix: replaced two bare ``except:`` clauses. ``extract_first`` already
    returns None on a miss (so the remark try/except was dead weight), and
    the float conversion now catches only the errors it can raise —
    ``except: price = price`` was a silent no-op.
    """
    for sel in response.css('ul.gl-warp.clearfix > li.gl-item'):
        item = JdItem()
        # string() flattens the <em> subtree (highlight tags) into one name.
        name = sel.css('div.p-name').xpath('string(.//em)').extract_first()
        price = sel.css('div.p-price i::text').extract_first()
        # Comment-count node class varies between p-commit and p-comm.
        remark = sel.xpath(
            './/div[(@class="p-commit" or @class="p-comm")]').xpath(
                'string(.)').extract_first()
        if remark:
            remark = remark.strip()
        try:
            price = float(price)
        except (TypeError, ValueError):
            pass  # keep raw string (or None) when not numeric
        # Publisher: self-operated pages use p-shopnum, others p-shop.
        publish = sel.css('div.p-shopnum a::text').extract_first()
        if publish is None:
            publish = sel.css('div.p-shop a::text').extract_first()
        item['name'] = name
        item['price'] = price
        item['remark'] = remark
        item['publish'] = publish
        yield item
def parse(self, response):
    """Collect every image src under .post elements into a single item.

    Note: 'imgurl' holds a *list* — one page can carry several images.
    """
    item = JdItem()
    item['imgurl'] = response.css(".post img::attr(src)").extract()
    yield item
def parse(self, response):
    """Yield url/name/price per .gl-item product card.

    BUG FIX: ``float(extract_first())`` raised TypeError whenever the price
    node was missing (e.g. lazily-priced items), killing the whole page.
    Fall back to None for the price instead.
    """
    for el in response.css('.gl-item'):
        raw_price = el.css('.p-price i::text').extract_first()
        yield JdItem(
            url=el.css('.p-name > a::attr("href")').extract_first(),
            name=el.css('.p-name > a::attr("title")').extract_first(),
            price=float(raw_price) if raw_price is not None else None,
        )
def parse(self, response):
    """Parse a JD list page, fetch each product's price JSON, then follow
    the pn-next link for up to 10 pages.

    Fixes:
    - ``extract_first().strip()`` crashed with AttributeError whenever the
      product-name node was missing; default to '' before stripping.
    - Skip entries without a sku: the price endpoint needs product_id.
    """
    print("正在抓取%s的页面内容" % (response.url))
    self.page += 1
    # Every product sits in a <li> under the gl-warp list.
    ul = response.xpath('//ul[contains(@class, "gl-warp")]/li')
    for li in ul:
        items = JdItem()
        items['shop_name'] = li.xpath(
            './div/div[@class="p-shop"]/@data-shop_name').extract_first()
        items['product_name'] = (li.xpath(
            './div[1]/div[4]/a/em/text()').extract_first() or '').strip()
        items['product_id'] = li.xpath(
            './div[@class="gl-i-wrap j-sku-item"]/@data-sku').extract_first()
        items['product_url'] = li.xpath(
            './div//div[@class="p-img"]/a/@href').extract_first()
        if not items['product_id']:
            continue  # malformed card: no sku, no price endpoint
        # Price lives in a separate JSONP endpoint keyed by sku.
        url = ('https://p.3.cn/prices/mgets?callback=jQuery1493916&skuIds=J_'
               + items['product_id'] + '&pduid=%s' % (time.time()))
        yield scrapy.Request(url=url, callback=self.parse_price,
                             meta={"item": items})
    # Pagination capped at 10 pages.
    if self.page < 10:
        try:
            next_url = response.xpath(
                '//a[@class="pn-next"]/@href').extract_first()
            yield scrapy.Request(url='https://list.jd.com' + next_url,
                                 callback=self.parse)
        except Exception as e:
            print("获取下一页链接失败>>%s" % e)
    else:
        print("抓取%s页数据结束!" % self.page)
def parse(self, response):
    """Parse the first 30 (visible) results of a JD search page, then request
    the lazy-loaded back half.

    BUG FIX: ``ids`` was created but never filled, so the follow-up URL was
    always built with an empty show_items list — the lazy-load endpoint
    needs the comma-joined sku ids of the visible half.
    """
    ids = []
    # Presell items carry class "gl-item gl-item-presell", hence contains().
    data_list = response.xpath('//li[contains(@class,"gl-item")]')
    for data in data_list:
        item = JdItem()
        # Multi-variant products expose data-pid; single ones data-sku.
        item_id = data.xpath('./@data-pid').extract()
        if not item_id:
            item_id = data.xpath('./@data-sku').extract()
        item_name = data.xpath(
            './/div[contains(@class,"p-name")]//em/text()[1]').extract()
        item_price = data.xpath(
            './/div[@class="p-price"]//i/text()').extract()
        if item_id:
            item['item_id'] = item_id[0]
            item['item_url'] = 'https://item.jd.com/' + item_id[0] + '.html'
            ids.append(item_id[0])  # collected for the lazy-load request
        if item_name:
            item['item_name'] = item_name[0]
        if item_price:
            item['item_price'] = item_price[0]
        yield item
    # The lazy-load endpoint validates the referer header.
    headers = {'referer': response.url}
    self.page += 1
    self.s += 30
    url = self.next_url % (self.keyword, self.page, self.s, ','.join(ids))
    yield scrapy.Request(url, callback=self.next_parse, headers=headers)
def next_parse(self, response):
    """Parse the lazy-loaded back half of a search page, then queue the next
    page of the regular listing (capped at page 200)."""
    for node in response.xpath('//li[contains(@class,"gl-item")]'):
        record = JdItem()
        # data-pid for multi-variant products, data-sku otherwise.
        sku = node.xpath('./@data-pid').extract() \
            or node.xpath('./@data-sku').extract()
        names = node.xpath(
            './/div[contains(@class,"p-name")]//em/text()[1]').extract()
        prices = node.xpath(
            './/div[@class="p-price"]//i/text()').extract()
        if sku:
            record['item_id'] = sku[0]
            record['item_url'] = 'https://item.jd.com/' + sku[0] + '.html'
        if names:
            record['item_name'] = names[0]
        if prices:
            record['item_price'] = prices[0]
        yield record
    if self.page < 200:
        self.page += 1
        self.s += 30
        yield scrapy.Request(self.url % (self.keyword, self.page, self.s),
                             callback=self.parse)
def parse(self, response):
    """Yield name/price only for products whose promo-icon text equals
    self.flag; then advance pagination up to page 200."""
    product_nodes = response.xpath(
        '//div[@id="J_goodsList"]//li[@class="gl-item"]')
    for node in product_nodes:
        item = JdItem()
        icon_texts = node.xpath(
            './/i[@class="goods-icons4 J-picon-tips"]/text()')
        for icon in icon_texts:
            if icon.extract() != self.flag:
                continue
            item['price'] = node.xpath(
                './/div[@class="p-price"]//i/text()').extract()[0]
            # A <font> prefix (highlighted keyword) is prepended when present.
            prefix_parts = node.xpath(
                './/div[@class="p-name"]//font/text()').extract()
            base_name = node.xpath(
                './/div[@class="p-name p-name-type-2"]//em/text()').extract()[0]
            if len(prefix_parts) > 0:
                item['name'] = prefix_parts[0] + base_name
            else:
                item['name'] = base_name
            yield item
    if self.page < 200:
        self.page += 1
        yield scrapy.Request(self.url + str(self.page), callback=self.parse)
def parse(self, response):
    """Parse a #plist product grid and follow the pn-next pagination link.

    Fixes:
    - ``extract_first().strip()`` on the name raised AttributeError when the
      node was missing; default to '' first.
    - ``!= None`` replaced with the idiomatic ``is not None``.
    """
    products = response.xpath(
        './/div[@id="plist"]/ul[contains(@class, "gl-warp")]/li[@class="gl-item"]'
    )
    for product in products:
        item = JdItem()
        item['name'] = (product.xpath(
            './/div[@class="p-name"]/a/em/text()').extract_first() or '').strip()
        # NOTE(review): the href can also be absent, which would make this
        # concatenation raise — confirm against the live markup.
        item['url'] = 'https:' + product.xpath(
            './/div[@class="p-name"]/a/@href').extract_first()
        item['price'] = product.xpath(
            './/div[@class="p-price"]/strong[@class="J_price"][1]/i/text()'
        ).extract_first()
        item['comment'] = product.xpath(
            './/a[@class="comment"]/text()').extract_first()
        item['shop'] = product.xpath(
            './/div[@class="p-shop"]/span/a/text()').extract_first()
        yield item
    next_url = response.xpath(
        './/a[@class="pn-next"]/@href').extract_first()
    if next_url is not None:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse)
def parse(self, response):
    """Queue a detail-page request for every sku listed under #plist, then
    follow the pn-next pagination link.

    BUG FIX: the original did ``extract()[0]`` and only afterwards tested
    ``len(ProductID)``, so a <li> without data-sku raised IndexError before
    the guard could ever run. ``extract_first()`` returns None safely.
    """
    sel = scrapy.Selector(response)
    goods_info = sel.xpath(".//div[@id='plist']/ul/li")
    for goods in goods_info:
        ProductID = goods.xpath(
            ".//div[@class='gl-i-wrap j-sku-item']/@data-sku").extract_first()
        if ProductID:
            # Detail page (title, shop, category, brand, model, …).
            goods_web = "https://item.jd.com/" + str(ProductID) + ".html"
            item = JdItem(ProductID=ProductID)
            yield scrapy.Request(url=goods_web, callback=self.goods,
                                 meta={'item': item}, dont_filter=True)
        else:
            print("parse中ProductID为空 没有读到")
    # Pagination: pn-next href is relative to list.jd.com.
    next_page = sel.xpath(
        ".//div[@class='p-wrap']/span[@class='p-num']/a[@class='pn-next']/@href"
    ).extract()
    if next_page:
        yield scrapy.Request("https://list.jd.com/" + next_page[0],
                             callback=self.parse)
def parse(self, response):
    """Walk JD's three-level category JSON and yield one item per leaf
    category, carrying url/name for the big, middle and small levels."""
    # The endpoint responds in GBK; utf-8 decoding would produce mojibake.
    result = json.loads(response.body.decode('GBK'))
    item = JdItem()
    if result['data']:
        for data in result["data"]:
            # Top level: first element of "s"; its "n" encodes url|name.
            big = data['s'][0]
            item["b_cate_url"], item["b_cate_name"] = self.get_info_cate(
                big["n"])
            # Middle categories nest under the big one, sibling to "n".
            for middle in big['s']:
                item["m_cate_url"], item["m_cate_name"] = self.get_info_cate(
                    middle["n"])
                # Small categories nest one level deeper.
                for small in middle['s']:
                    item["s_cate_url"], item["s_cate_name"] = \
                        self.get_info_cate(small["n"])
                    yield item
def parse(self, response):
    """Parse product cards tolerating missing fields, then re-request the
    base url for the next page (dont_filter bypasses the dupe filter).

    Fix: six bare ``except:`` blocks collapsed into one safe-extract helper
    with an explicit empty-list check — ''.join over extract() never raises,
    so those try/excepts were either dead or hiding real bugs.
    """

    def _first(sel, xp, default=''):
        # Missing nodes are routine (ads, presell cards); fail quietly.
        values = sel.xpath(xp).extract()
        return values[0] if values else default

    products = response.xpath('//ul[@class="gl-warp clearfix"]/li')
    for product in products:
        item = JdItem()
        # Name spans several text nodes; join them and drop spaces.
        item['name'] = ''.join(product.xpath(
            './/div[@class="p-name p-name-type-2"]/a/em/text()'
        ).extract()).strip().replace(' ', '')
        item['price'] = _first(product, './/div[@class="p-price"]//i/text()')
        item['store'] = _first(product, './/div[@class="p-shop"]//a/@title')
        item['evaluate_num'] = _first(
            product, './/div[@class="p-commit"]/strong/a/text()')
        item['detail_url'] = _first(
            product, './/div[@class="p-name p-name-type-2"]/a/@href')
        # "自营" (self-operated) badge is the first p-icons <i>, when present.
        if _first(product, './/div[@class="p-icons"]/i/text()') == '自营':
            item['support'] = '自营'
        else:
            item['support'] = '非自营'
        yield item
        print(item)
    if self.page < 100:
        self.page += 1
        print(self.page)
        yield scrapy.Request(url=self.base_url, callback=self.parse,
                             meta={'page': self.page}, dont_filter=True)
def parse_shop(self, response):
    """Yield the shop's display name and absolute url from a product page.

    BUG FIX: ``'https:' + None`` raised TypeError on pages without the shop
    block (e.g. JD self-operated items); guard before concatenating.
    """
    item = JdItem()
    sel = Selector(response)
    shop_anchor = '//div[@class="J-hove-wrap EDropdown fr"]/div/div/a'
    item['title'] = sel.xpath(shop_anchor + '/text()').extract_first()
    href = sel.xpath(shop_anchor + '/@href').extract_first()
    # hrefs are protocol-relative; prepend the scheme when present.
    item['url'] = 'https:' + href if href is not None else None
    yield item
def parse_page(self, response):
    """Parse a #plist product page.

    Per-product items accumulate into ``data`` while their sku ids are
    batched into a comment-summary URL; every 30 products one Request is
    fired to ``parse_one`` carrying the batch, then the accumulators reset.
    Finally the pn-next link (if any) re-enters this method.
    """
    item_list = response.css('#plist > ul > li > div')
    data = []       # items collected for the current batch of 30
    tmp_count = 0   # size of the current batch
    # Base of the comment-summary endpoint; sku ids are appended below.
    check_chat = 'https://club.jd.com/comment/productCommentSummaries.action?my=pinglun&referenceIds='
    for one in item_list:
        tmp_count += 1
        my_items = JdItem()
        my_items['SkuId'] = one.css('::attr(data-sku)').extract_first()
        my_items['img'] = one.css(
            'div.p-img > a::attr(href)').extract_first()
        my_items['name'] = one.css(
            'div.p-name > a > em::text').extract_first()
        my_items['url'] = one.css(
            'div.p-name > a::attr(href)').extract_first()
        try:
            # NOTE(review): extract_first() does not raise on a miss, so the
            # except branch is effectively unreachable and is_jd ends up True
            # for every product — confirm the intended check.
            tmp = one.css('div.p-icons.J-pro-icons > img::attr(data-tips)'
                          ).extract_first()
            my_items['is_jd'] = True
        except Exception as e:
            self.logger.info(e)
            my_items['is_jd'] = False
        data.append(my_items)
        if tmp_count >= 30:
            # Batch full: append the last sku plus the JSONP boilerplate …
            check_chat = ','.join([
                check_chat,
                my_items['SkuId'] + '&callback=jQuery270940&_=1492343539522'
            ])
            # … fetch the comment summaries for the whole batch …
            yield scrapy.Request(url=check_chat,
                                 callback=self.parse_one,
                                 meta={
                                     'cookies': False,
                                     'data': data
                                 })
            # … and reset the accumulators for the next batch.
            check_chat = 'https://club.jd.com/comment/productCommentSummaries.action?my=pinglun&referenceIds='
            tmp_count = 0
            data = []
        else:
            # First sku attaches directly; later ones are comma-joined.
            if tmp_count == 1:
                check_chat = ''.join([check_chat, my_items['SkuId']])
            else:
                check_chat = ','.join([check_chat, my_items['SkuId']])
    # NOTE(review): a trailing partial batch (<30 items) is never flushed.
    next_page = response.css(
        '#J_bottomPage > span.p-num > a.pn-next::attr(href)'
    ).extract_first()
    if next_page:
        self.logger.info('next {}'.format(next_page))
        url = ''.join(['https://list.jd.com', next_page])
        yield scrapy.Request(url=url,
                             callback=self.parse_page,
                             meta={
                                 'authority': 'list.jd.com',
                                 'method': 'GET',
                                 'path': next_page,
                                 'scheme': 'https',
                                 'cookies': True
                             })
def parse(self, response):
    """Parse one JSONP page of product comments into a single list-valued
    item, then queue further comment pages.

    Fixes:
    - the payload was json.loads()-ed four separate times; decode once.
    - page-count math used ``/`` (float in Python 3); use ``//``.
    """
    # Strip the JSONP wrapper: fetchJSON_comment98vv<ver>( … );
    comments_json = response.text[
        len('fetchJSON_comment98vv' + self.commentVersion + '('):-2]
    payload = json.loads(comments_json)
    productCommentSummary = payload.get('productCommentSummary')
    comments = payload.get('comments')
    item = JdItem()
    item['goodRateShow'] = productCommentSummary.get('goodRateShow')
    item['generalRateShow'] = productCommentSummary.get('generalRateShow')
    item['poorRateShow'] = productCommentSummary.get('poorRateShow')
    item['commentCount'] = productCommentSummary.get('commentCount')
    item['productId'] = productCommentSummary.get('productId')
    # Per-comment fields are accumulated as parallel lists on ONE item.
    list_fields = ('referenceName', 'referenceId', 'content', 'creationTime',
                   'nickname', 'userLevelName', 'userClientShow', 'id',
                   'score', 'guid')
    for field in list_fields:
        item[field] = []
    for comment in comments:
        for field in list_fields:
            item[field].append(comment.get(field))
    yield item
    # Page count at 10 comments per page (integer arithmetic).
    maxPage = productCommentSummary.get('commentCount')
    if maxPage % 10 == 0:
        page = maxPage // 10
    else:
        page = maxPage // 10 + 1
    # NOTE(review): the original always requested pages 1..49 regardless of
    # the computed ``page``; that behavior is preserved — confirm intent.
    for k in range(1, 50):
        yield Request(self.generate_product_comment_url(
            self.commentVersion, self.productID, k), callback=self.parse)
def parse(self, response):
    """Yield title/price/commit/img/shop for every #plist result row."""
    for row in response.css('#plist ul.gl-warp li.gl-item'):
        product = JdItem()
        product['title'] = row.css('div.p-name em::text').extract_first()
        product['price'] = row.css('div.p-price i::text').extract_first()
        product['commit'] = row.css('div.p-name i::text').extract_first()
        product['img'] = row.css(
            'div.p-img img::attr(src)').extract_first()
        product['shop'] = row.css(
            'div.p-shop a::attr(title)').extract_first()
        yield product
def parse_detail(self, response):
    """Merge the comment-count payload with the fields carried in meta."""
    # The endpoint replies in GBK, not utf-8.
    json_data = response.body.decode("GBK")
    print(json_data)
    meta = response.meta
    item = JdItem()
    for key in ("title", "merchant_name", "price", "commodity_id"):
        item[key] = meta[key]
    # Raw JSON text is stored as-is; parsing is left to the pipeline.
    item["comment_count"] = json_data
    yield item
def parse_comment(self, response):
    """Parse one comment page, paginate via meta['page'] (cap 100), and
    yield one item per comment.

    NOTE(review): demjson tolerates the non-strict JSON JD serves here.
    """
    item = JdItem()
    meta = response.meta
    response_json = demjson.decode(txt=response.text, encoding='utf-8')
    # Queue the next page by rewriting the trailing "...=<page>" query value.
    if meta['page'] < int(response_json['maxPage']) and meta['page'] < 100:
        meta['page'] += 1
        url = u'='.join(response._url.split(u'=')[:-1]) + u'=' + str(meta['page'])
        yield Request(url, meta=meta, callback=self.parse_comment)
    for c in response_json['comments']:
        # NOTE(review): .encode('utf-8') turns `content` into bytes, so
        # format() renders it as b'...' on Python 3 — confirm intended.
        content = ''.join(c['content']).strip().replace(u'\n', u'').replace(u'\r', u'').encode('utf-8')
        item['info'] = '{} {}'.format(meta['score'], content)
        yield item
def parse_item(self, response):
    """Parse a JD product detail page; price and good-rate are fetched from
    two auxiliary JSONP endpoints and a fully-populated item is yielded.

    NOTE(review): urllib.request.urlopen blocks Scrapy's reactor; normally
    these endpoints would be chased with scrapy.Request.
    """
    try:
        i = JdItem()
        thisurl = response.url
        # Product id is a digit run before ".html" in the detail url.
        pat = 'item.jd.com.*?(\d+).html'
        x = re.search(pat, thisurl)
        if x:
            thisid = re.compile(pat).findall(thisurl)[0]
            print(thisid)
            title = response.xpath('//html/head/title/text()').extract()
            shop = response.xpath(
                '//div[@class="name"]/a/text()').extract()
            shoplink = response.xpath(
                '//div[@class="name"]/a/@href').extract()
            # JSONP price endpoint keyed by sku id.
            priceurl = 'https://p.3.cn/prices/mgets?callback=jQuery8766554&type=1&area=1_72_4137_0&pdtk=&pduid=15048784180911725795195&pdpin=&pin=null&pdbp=0&skuIds=J_' + str(
                thisid)
            # Comment endpoint; goodRateShow is regexed out of the raw body.
            commenturl = 'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv10&productId=' + str(
                thisid
            ) + '&score=0&sortType=5&page=3&pageSize=10&isShadowSku=100001918171&rid=0&fold=1'
            #print(priceurl)
            #print(commenturl)
            pricedata = urllib.request.urlopen(priceurl).read().decode(
                'utf-8', 'ignore')
            commentdata = urllib.request.urlopen(commenturl).read().decode(
                'utf-8', 'ignore')
            pricepat = '"p":"(.*?)"'
            commentpat = '"goodRateShow":(.*?),'
            price = re.compile(pricepat).findall(pricedata)
            comment = re.compile(commentpat).findall(commentdata)
            # Only fill the item when every field matched.
            if (len(title)) and (len(shop)) and (len(shoplink)) and (
                    len(price)) and (len(comment)):
                i['goodsid'] = thisid
                i['title'] = title[0]
                i['shop'] = shop[0]
                i['goodslink'] = thisurl
                i['shoplink'] = 'https:' + shoplink[0]
                i['price'] = price[0]
                i['goodrate'] = comment[0]
                print(title[0])
                print(thisurl)
                print(shop[0])
                print('https:' + shoplink[0])
                print(price[0])
                print(comment[0])
                print('-----------')
            else:
                pass
        else:
            pass
        # NOTE(review): on any non-matching branch an *empty* item is still
        # yielded — confirm whether that is intended.
        yield i
    except Exception as e:
        print(e)
def parse(self, response):
    """Scrape the first thirty products of each page (rendered directly in
    the page markup).

    Yields one item per visible product while collecting their data-pid
    values — the lazy-load endpoint handled by next_parse requires them as
    its show_items parameter.
    :param response:
    :return:
    """
    ids = []
    x = 0  # count of products actually found on this page
    for li in response.xpath('//*[@id="J_goodsList"]/ul/li'):
        x = x + 1
        item = JdItem()
        price = li.xpath('div/div/strong/i/text()').extract()  # price
        shop = li.xpath(
            'div/div[@class="p-shop"]/span/a/text()').extract()  # shop
        tags = li.xpath(
            'div/div[@class="p-icons"]/i/text()').extract()  # tag icons
        title = li.xpath('div/div/a/em/text()').extract()  # title
        id = li.xpath('@data-pid').extract()  # product id
        ids.append(''.join(id))
        url = li.xpath('div/div[@class="p-name p-name-type-2"]/a/@href'
                       ).extract()  # detail link to follow
        item['title'] = ''.join(title)
        item['keyword'] = ''.join(self.keyword)
        item['shop'] = ''.join(shop)
        item['price'] = ''.join(price)
        item['tags'] = ''.join(tags)
        item['url'] = ''.join(url)
        # Protocol-relative hrefs need an explicit scheme.
        if item['url'].startswith('//'):
            item['url'] = 'https:' + item['url']
        yield item
    print('京东采集 :' + self.keyword + ' 显示页面已采集' + str(x) + '条,' +
          'Page = ' + str(self.page))
    if x < 1:
        # An empty page means the result set is exhausted.
        self.crawler.engine.close_spider(self, '已爬取所有信息!')
    else:
        headers = {'referer': response.url}
        # The endpoint for the back thirty items checks the referer: it must
        # be this page's actual URL; a wrong referer redirects to
        # https://www.jd.com/?se=deny
        self.page += 1
        yield scrapy.Request(
            self.next_url % (self.keyword, self.keyword, self.page,
                             ','.join(ids)),
            callback=self.next_parse,
            headers=headers)
def parse(self, response):
    """Yield price and title for each product wrapper under #J_goodsList."""
    wrappers = response.xpath(
        '//div[@id="J_goodsList"]//li[@class="gl-item"]/div[contains(@class, "gl-i-wrap")]')
    print('-' * 20)
    print(len(wrappers))
    for wrapper in wrappers:
        record = JdItem()
        price_parts = wrapper.xpath(
            './/div[contains(@class, "p-price")]//i[1]/text()').extract()
        record['price'] = ''.join(price_parts).strip()
        title_parts = wrapper.xpath(
            './/div[contains(@class, "p-name")]//text()').extract()
        record['title'] = ''.join(title_parts).strip()
        yield record
def parse(self, response):
    """Capture id/title/price for each result and follow its detail page."""
    for good in response.xpath('//li[@class="gl-item"]'):
        sku = good.xpath('./@data-sku').extract_first()
        detail_link = f'https://item.jd.com/{sku}.html'
        record = JdItem()
        record['good_ID'] = sku
        # string() flattens the <em> subtree (highlight tags) into one title.
        record['title'] = good.xpath(
            'string(.//div[@class="p-name p-name-type-2"]//em)'
        ).extract_first()
        record['link'] = detail_link
        record['price'] = good.xpath(
            './/div[@class="p-price"]//i//text()').extract_first()
        yield scrapy.Request(url=detail_link, meta={'item': record},
                             callback=self.parse_detail)
def parse_item(self, response):
    """Extract a job description from a detail page, persist it to
    per-category .txt files plus a CSV index, and yield the item.

    Side effects: creates folders under self.FolderName, appends a row to
    self.out (csv file handle), and increments the module-global counter
    ``cn`` used to number the output files.
    """
    res = response
    c = response.meta
    item = JdItem()
    # Round-trip through gbk drops characters the page mislabels; spaces
    # are stripped before parsing.
    bs4_response = BeautifulSoup(
        res.text.encode('gbk', 'ignore').decode('gbk', errors='ignore').replace(
            ' ', ''), 'lxml')
    jd = bs4_response.find("div", {
        "class": "job_part"
    }).find("div", {
        "class": "job_detail"
    }).get_text()
    # Collapse blank-line runs, then trim leading/trailing newlines.
    jd = re.sub('\n+', '\n', jd, re.S).strip("\n").strip()
    try:
        r = c['cate']  # category name passed along in meta
        mkdir(self.FolderName + '/' + r)
        item['jd'] = re.sub('\s+', '', jd, re.S)  # strip ALL whitespace
        item['job_title'] = bs4_response.find("div", {
            "class": "new_job_name"
        }).get_text().strip()
        #print(item['jd'])
        global cn
        cn += 1
        print(cn)
        path = str(r) + '/' + str(cn) + '.txt'  # relative path for the CSV
        with open(self.FolderName + '/' + str(r) + '/' + str(cn) + '.txt',
                  'a', encoding="utf-8") as fw:
            fw.write(item['jd'])
        fw.close()  # NOTE(review): redundant — the with-block already closed
        print(item['jd'])
        JT = [item['job_title'], path]
        csv_writer = csv.writer(self.out, dialect='excel')
        csv_writer.writerow(JT)
        '''
        with open(str(cn) + '.txt', 'a', encoding='utf-8') as fwe: fwe.write(item['jd'])
        '''
        # Descriptions containing an email address are additionally dumped
        # to a flat <cn>.txt in the working directory.
        if '@' in item['jd']:
            with open(str(cn) + '.txt', 'a', encoding='utf-8') as fwe:
                fwe.write(item['jd'])
        yield item
    except:
        print('职位爬取失败。。')
        print('失败页数:', self.PAGES)
def parse(self, response):
    """Yield price/title/comment/shop for each product in the list.

    BUG FIX: the original wrote ``if list:`` — testing the *builtin* list
    type, which is always truthy — instead of the scraped node list.

    NOTE(review): @class="gl-warp.clearfix" compares against the literal
    attribute value; JD markup usually carries class="gl-warp clearfix" —
    confirm the selector against the target page.
    """
    goodslist = response.xpath('//ul[@class="gl-warp.clearfix"]//li')
    if goodslist:
        for onegoods in goodslist:
            item = JdItem()
            item['price'] = ''.join(
                onegoods.xpath('.//div[@class="p-price"]/strong/text()').
                extract()).strip()
            item['title'] = ''.join(
                onegoods.xpath('.//div[@class="p-name"]//em/text()').
                extract()).strip()
            item['comment'] = ''.join(
                onegoods.xpath('.//div[@class="p-commit"]/strong/text()').
                extract()).strip()
            item['boss'] = ''.join(
                onegoods.xpath('.//div[@class="p-shopnum"]/a/text()').
                extract()).strip()
            yield item
def parse_item(self, response):
    """Parse a product detail page; price and good-rate are fetched from two
    auxiliary JSONP endpoints with blocking urllib calls.

    NOTE(review): urlopen blocks the Scrapy reactor — normally these would
    be follow-up scrapy.Requests.
    """
    try:
        item = JdItem()
        thisurl = response.url
        # Product id sits between "item.jd.com/" and ".html".
        pat = "item.jd.com/(.*?).html"
        x = re.search(pat, thisurl)
        if (x):
            # product id
            thisid = re.compile(pat).findall(thisurl)[0]
            # product title
            item['title'] = response.xpath(
                "//div[@class='sku-name']/text()").extract()
            # shop name
            item['shop'] = response.xpath(
                "//div[@class='name']/a/text()").extract()
            # shop link
            item['shoplink'] = response.xpath(
                "//div[@class='name']/a/@href").extract()
            # price JSONP endpoint keyed by sku
            priceurl = "https://p.3.cn/prices/get?type=1&area=1_72_2799&pdtk=&pduid=1888909243&pdpin=&pdbp=0&skuid=J_" + str(
                thisid) + "&callback=cnp"
            # fetch price (blocking)
            pricedata = urllib.request.urlopen(priceurl).read().decode(
                "utf-8", "ignore")
            pricepat = '"p":"(.*?)"'
            item['price'] = re.compile(pricepat).findall(pricedata)
            # comment endpoint
            commenturl = "https://sclub.jd.com/comment/productPageComments.action?productId=" + str(
                thisid
            ) + "&score=0&sortType=3&page=0&pageSize=10&callback=fetchJSON_comment98vv4956"
            # fetch good-rate percentage (blocking)
            commentdata = urllib.request.urlopen(commenturl).read().decode(
                "utf-8", "ignore")
            commentpat = 'goodRateShow":(.*?),'
            item['comment'] = re.compile(commentpat).findall(commentdata)
            yield item
    except Exception as e:
        print(e)
def parse_detail(self, response):
    """Parse a search page by extracting *parallel lists* (price, name, shop,
    picture, sku) and joining them by index; each product then triggers a
    comment-summary JSON request.

    NOTE(review): indexing name[i]/shopid[i]/picture[i]/productId[i] by the
    length of ``price`` assumes all five XPath result lists line up 1:1 — a
    missing node in any one column shifts or crashes the join. Confirm
    against the live markup.
    """
    sel = response
    pageindex = response.meta['pageindex']
    print(
        "-------------------------------response-----------------------------------"
    )
    price = sel.xpath(
        '*//div[@id="J_goodsList"]//div[@class="p-price"]//i/text()'
    ).extract()
    name = sel.xpath(
        '*//div[@id="J_goodsList"]//div[@class="p-name p-name-type-3"]//em/text()'
    ).extract()
    # Raw p-shop HTML fragments; the title is regexed out per item below.
    shopid = sel.xpath(
        '*//div[@id="J_goodsList"]//div[@class="p-shop"]').extract()
    picture = sel.xpath(
        '*//div[@id="J_goodsList"]//div[@class="p-img"]//img/@data-lazy-img'
    ).extract()
    productId = sel.xpath(
        '*//div[@id="J_goodsList"]//li[@class="gl-item"]/@data-sku'
    ).extract()
    for i in range(0, len(price)):
        item = JdItem()
        item["pageindex"] = pageindex
        item['index'] = str(i + 1)
        item['ok'] = 'False'
        item['价格'] = price[i]
        item['品牌'] = name[i]
        shop = re.compile(r'title="(.+?)">').findall(shopid[i])
        if (len(shop) == 0):
            shop = ""
        else:
            shop = shop[0]
        item['店铺'] = shop
        item['图片'] = picture[i]
        item['ID'] = productId[i]
        item['page_url'] = response.url
        item['url'] = "https://item.jd.com/" + item['ID'] + ".html"
        # Comment summary endpoint keyed by sku id.
        json_url = "https://club.jd.com/comment/productCommentSummaries.action?referenceIds=" + item[
            "ID"]
        yield Request(url=json_url,
                      callback=self.parse_json,
                      meta={'item': item},
                      headers=self.default_headers)
def next_parse(self, response):
    """Scrape the back thirty products of each page; they are served by a
    separate endpoint whose url embeds the ids of the first thirty.
    :param response:
    :return:
    """
    y = 0  # number of products found on this hidden page
    for li in response.xpath('//li[@class="gl-item"]'):
        y = y + 1
        item = JdItem()
        price = li.xpath('div/div/strong/i/text()').extract()  # price
        shop = li.xpath(
            'div/div[@class="p-shop"]/span/a/text()').extract()  # shop
        tags = li.xpath(
            'div/div[@class="p-icons"]/i/text()').extract()  # tag icons
        title = li.xpath('div/div/a/em/text()').extract()  # title
        url = li.xpath('div/div[@class="p-name p-name-type-2"]/a/@href'
                       ).extract()  # detail link to follow
        item['title'] = ''.join(title)
        item['keyword'] = ''.join(self.keyword)
        item['shop'] = ''.join(shop)
        item['price'] = ''.join(price)
        item['tags'] = ''.join(tags)
        item['url'] = ''.join(url)
        # Protocol-relative hrefs need an explicit scheme.
        if item['url'].startswith('//'):
            item['url'] = 'https:' + item['url']
        yield item
        # print(item)
    print('京东采集 :' + self.keyword + ' 隐藏页面已采集' + str(y) + '条,' +
          'Page = ' + str(self.page))
    if y < 1:
        # Empty page: all results consumed — stop the spider.
        self.crawler.engine.close_spider(self, '已爬取所有信息!')
    else:
        #if self.page < 200:
        self.page += 1
        yield scrapy.Request(self.url % (self.keyword, self.keyword,
                                         self.page),
                             callback=self.parse)