def parse(self, response):
    """Collect detail-page links and schedule one request per link.

    Request's first positional argument must be a single URL string, so the
    extracted list cannot be passed wholesale; each URL is handed to parse2
    through meta because parse2 needs only the URL, not other item fields.
    """
    item = AutopjtItem()
    item['link'] = response.xpath("//li/a[@class='img']/@href").extract()
    for link in item['link']:
        # Pass the URL itself down to the callback instead of the whole item.
        yield scrapy.Request(link, meta={'url': link}, callback=self.parse2)
def parse(self, response):
    """Extract all detail-page URLs and crawl each with parse2.

    Instantiating the item here is only needed because the URL list is kept
    in item['link']; if the parent page carried other per-product data, the
    item would better be created inside the loop and passed down whole.
    """
    item = AutopjtItem()
    item['link'] = response.xpath("//li/a[@class='img']/@href").extract()
    for detail_url in item['link']:
        # Only the URL is forwarded — parse2 does not use other item fields.
        yield scrapy.Request(detail_url,
                             meta={'url': detail_url},
                             callback=self.parse2)
def parse(self, response):
    """Create a fresh item per extracted URL and crawl its detail page.

    Each URL is the smallest crawl unit, so every iteration gets its own
    item instance carrying the URL in item['link'].
    """
    detail_urls = response.xpath("//li/a[@class='img']/@href").extract()
    for detail_url in detail_urls:
        item = AutopjtItem()
        item['link'] = detail_url  # the URL the next layer will scrape
        yield scrapy.Request(item['link'],
                             meta={'item': item},
                             callback=self.parse2)
def parse(self, response):
    """Debug version: extract the product anchors and print each one.

    Bug fix: the original ended with a bare ``'''yield item`` — an
    unterminated triple-quoted string, which is a SyntaxError. It is
    converted to a plain comment here; the other field extractions remain
    commented out as in the original.
    """
    print("work")
    item = AutopjtItem()
    item["name"] = response.xpath("//a[@class='pic']").extract()
    # item["price"] = response.xpath("//span[@class='price_n']/text()").extract()
    # item["link"] = response.xpath("//a[@class='pic']/@href").extract()
    # item["comnum"] = response.xpath("//a[@name='P_pl']/text()").extract()
    for name in item["name"]:
        print(name)
    # print(item["name"])
    # yield item
def parse2(self, response):
    """Extract detail-page fields and attach the URL received from parse()."""
    item = AutopjtItem()
    item['name'] = response.xpath(
        "//div[@class='name_info']/h1/text()").extract()
    item['price'] = response.xpath("//p[@id='dd-price']/text()").extract()
    item['comnum'] = response.xpath(
        "//div[@class='describe_detail']/span/text()").extract()
    # URL handed down from parse() via Request meta.
    item['link'] = response.meta['url']
    return item
def parse(self, response):
    """Scrape one listing page, then queue pages 1-4 of the same category."""
    item = AutopjtItem()
    item['name'] = response.xpath('//a[@class="pic"]/@title').extract()
    item['price'] = response.xpath('//span[@class="price_n"]/text()').extract()
    item['link'] = response.xpath('//a[@class="pic"]/@href').extract()
    item['comnum'] = response.xpath('//a[@dd_name="单品评论"]/text()').extract()
    yield item
    # Follow the fixed page range of this category.
    for page in range(1, 5):
        next_url = f"http://category.dangdang.com/pg{page}-cid4011034.html"
        yield Request(next_url, callback=self.parse)
def parse(self, response):
    """Scrape one listing page, then queue pages 1-5 of the category."""
    item = AutopjtItem()
    item['name'] = response.xpath("//a[@class='pic']/@title").extract()
    item['price'] = response.xpath(
        "//span[@class='price_n']/text()").extract()
    item['link'] = response.xpath("//a[@class='pic']/@href").extract()
    item['comnum'] = response.xpath(
        "//a[@name='itemlist-review']/text()").extract()
    yield item
    for page in range(1, 6):
        # Same URL as the original f-string, built by concatenation.
        next_url = ('http://category.dangdang.com/pg' + str(page)
                    + '-cp01.41.00.00.00.00-shbig.html')
        yield Request(next_url, callback=self.parse)
def parse(self, response):
    """Scrape one listing page, then queue pages 1-45 of the category."""
    item = AutopjtItem()
    item["name"] = response.xpath('//a[@class="pic"]/@title').extract()
    item["price"] = response.xpath(
        '//span[@class="price_n"]/text()').extract()
    item["link"] = response.xpath('//a[@class="pic"]/@href').extract()
    item["comnum"] = response.xpath(
        '//a[@name="itemlist-review"]/text()').extract()
    yield item
    for page in range(1, 46):
        next_url = "http://category.dangdang.com/pg%d-cid4002203.html" % page
        yield Request(next_url, callback=self.parse)
def parse(self, response):
    """Scrape one listing page, then queue pages 1-75 of the category."""
    item = AutopjtItem()
    item["name"] = response.xpath("//a[@class='pic']/@title").extract()
    item["price"] = response.xpath(
        "//span[@class='search_now_price']/text()").extract()
    item["link"] = response.xpath("//a[@class='pic']/@href").extract()
    item["comnum"] = response.xpath(
        "//a[@class='search_comment_num']/text()").extract()
    yield item
    for page in range(1, 76):
        next_url = (
            f"http://category.dangdang.com/pg{page}-cp01.19.10.00.00.00.html")
        yield Request(next_url, callback=self.parse)
def parse(self, response):
    """Scrape one listing page, then queue pages 1-75 of the category."""
    item = AutopjtItem()
    item["name"] = response.xpath("//a[@class='pic']/@title").extract()
    item["price"] = response.xpath(
        "//span[@class='price_n']/text()").extract()
    item["link"] = response.xpath("//a[@class='pic']/@href").extract()
    item["comnum"] = response.xpath("//a[@name='P_pl']/text()").extract()
    yield item
    for page in range(1, 76):
        next_url = f"http://category.dangdang.com/pg{page}-cid4002203.html"
        yield Request(next_url, callback=self.parse)
def parse(self, response):
    """Scrape one listing page, then queue pages 1-17 of the category.

    The XPath expressions deliberately keep spaces around '=' exactly as in
    the original (XPath tolerates them).
    """
    item = AutopjtItem()
    item['name'] = response.xpath("//a[@class = 'pic']/@title").extract()
    item['price'] = response.xpath(
        "//span[@class = 'price_n']/text()").extract()
    item['link'] = response.xpath("//a[@class = 'pic']/@href").extract()
    item['comnum'] = response.xpath(
        "//a[@name = 'itemlist-review']/text()").extract()
    yield item
    for page in range(1, 18):
        next_url = "http://category.dangdang.com/pg%d-cid4011029.html" % page
        yield Request(next_url, callback=self.parse)
def parse(self, response):
    """Scrape one listing page (printing each field for debugging), then
    queue pages 1-4 of the category."""
    item = AutopjtItem()
    item['name'] = response.xpath('//a[@class="pic"]/@title').extract()
    print(item['name'])
    item['price'] = response.xpath(
        '//span[@class="price_n"]/text()').extract()
    print(item['price'])
    item['link'] = response.xpath('//a[@class="pic"]/@href').extract()
    print(item['link'])
    # item['comum'] = response.xpath('//a[@name="itemlist-review"]/text()').extract()
    # print(item['comum'])
    yield item
    for page in range(1, 5):
        next_url = f"http://category.dangdang.com/pg{page}-cid4011029.html"
        yield Request(next_url, callback=self.parse)
def parse(self, response):
    """Extract product name, price, link and review count from the listing
    page, then queue pages 1-26 of the category."""
    item = AutopjtItem()
    item["name"] = response.xpath("//a[@class='pic']/@title").extract()
    item["price"] = response.xpath(
        "//span[@class='price_n']/text()").extract()
    item["link"] = response.xpath("//a[@class='pic']/@href").extract()
    item["comnum"] = response.xpath(
        "//a[@name='itemlist-review']/text()").extract()
    yield item
    for page in range(1, 27):
        next_url = 'http://category.dangdang.com/pg{}-cid4011029.html'.format(
            page)
        yield Request(next_url, callback=self.parse)
def parse(self, response):
    """Extract product name, price, link and review count, then crawl all
    75 pages of the category automatically.

    Each page yields one item followed by Requests for the remaining pages;
    every Request calls back into this same method.
    """
    item = AutopjtItem()
    item["name"] = response.xpath("//a[@class='pic']/@title").extract()
    item["price"] = response.xpath("//span[@class='price_n']/text()").extract()
    item["link"] = response.xpath("//a[@class='pic']/@href").extract()
    item["comnum"] = response.xpath("//a[@name='P_pl']/text()").extract()
    yield item
    # Build the page URLs from the pattern observed on the site.
    for page in range(1, 76):
        next_url = f"http://category.dangdang.com/pg{page}-cid4002203.html"
        yield Request(next_url, callback=self.parse)
def parse(self, response):
    """Extract product fields from the listing page, then crawl all 75
    pages of the category via self-referencing Requests."""
    item = AutopjtItem()
    item['name'] = response.xpath('//a[@class="pic"]/@title').extract()
    item['price'] = response.xpath(
        '//span[@class="price_n"]/text()').extract()
    item['link'] = response.xpath('//a[@class="pic"]/@href').extract()
    item['comnum'] = response.xpath(
        '//a[@id="comm_num_down"]/text()').extract()
    yield item
    for page in range(1, 76):
        # Same URL as the original %-format, built with str().
        next_url = ('http://category.dangdang.com/pg' + str(page)
                    + '-cid10010584.html')
        yield Request(next_url, callback=self.parse)
def parse(self, response):
    """Extract product fields from the listing page, then crawl pages
    1-30 of the category."""
    item = AutopjtItem()
    item["name"] = response.xpath("//a[@class='pic']/@title").extract()
    item["price"] = response.xpath(
        "//span[@class='price_n']/text()").extract()
    item["link"] = response.xpath("//a[@class='pic']/@href").extract()
    item["comnum"] = response.xpath(
        "//a[@dd_name='单品评论']/text()").extract()
    yield item
    for page in range(1, 31):
        next_url = f"http://category.dangdang.com/pg{page}-cid10010056.html"
        yield Request(next_url, callback=self.parse)
def parse(self, response):
    """Extract tour-listing fields (name, sign-up count, price, link,
    remark), then queue result pages 2-7.

    NOTE(review): the query string contains ">s=0>e=0", which looks like a
    mangled "&gts=0&gte=0" (or similar) — kept byte-identical here; verify
    against the live site before changing.
    """
    item = AutopjtItem()
    item["mingcheng"] = response.xpath(
        '//h3[@class="yxk_dataList_title"]/a/text()').extract()
    item["yibaomingrenshu"] = response.xpath(
        '//div[@class="yxk_dataList_info_bottom fn-clear"]/span[1]/text()'
    ).extract()
    item["price"] = response.xpath(
        '//a[@class="yxk_dataList_info_price"]/text()[2]').extract()
    item["link"] = response.xpath(
        '//a[@class="yxk_dataList_info_price"]/@href').extract()
    item["remark"] = response.xpath(
        '//p[@class="yxk_dataList_msg"]/text()').extract()
    yield item
    base = ("http://www.youxiake.com/lines/list?code=around&place_id=2"
            "&days=1&month=0&holiday=0&price=0&tag=0&class_id=0>s=0>e=0"
            "&sdel=1&p=")
    for page in range(2, 8):
        yield Request(base + str(page), callback=self.parse)
def parse(self, response):
    """Extract product fields from the listing page, then queue pages 1-2
    of the category (range(1, 3) is exclusive of 3)."""
    item = AutopjtItem()
    item['name'] = response.xpath('//a[@class="pic"]/@title').extract()
    item['price'] = response.xpath(
        '//span[@class="price_n"]/text()').extract()
    item['link'] = response.xpath('//a[@class="pic"]/@href').extract()
    item['comnum'] = response.xpath(
        '//a[@name="itemlist-review"]/text()').extract()
    yield item
    for page in range(1, 3):
        next_url = f"http://category.dangdang.com/pg{page}-cid4003599.html"
        yield Request(next_url, callback=self.parse)
def parse(self, response):
    """Extract hiking-trip attributes from a forum thread page, then queue
    threads 3391-3469.

    The labelled fields are located by matching the visible Chinese label
    text (e.g. "穿越方式:") anywhere in the page.
    """
    item = AutopjtItem()
    item["mingcheng"] = response.xpath('//title/text()').extract()
    # Map each item field to the label text its XPath matches on.
    labelled = {
        "chuanyuefangshi": u"穿越方式:",
        "tubujuli": u"徒步距离:",
        "leijipashen": u"累计爬升:",
        "fengjingzhishu": u"风景指数:",
        "huodongqiangdu": u"活动强度:",
    }
    for field, label in labelled.items():
        item[field] = response.xpath(
            u'//*[contains(text(), "' + label + u'")]/text()').extract()
    item["lianjiedizhi"] = response.xpath('//link[1]/@href').extract()
    item["yibaomingrenshu"] = response.xpath(
        u'//dl[@class="nums mtw"]/dd/em/text()').extract()
    yield item
    for tid in range(3391, 3470):
        yield Request("http://www.kxhuwai.com/thread-" + str(tid) + "-1-1.html",
                      callback=self.parse)
def parse(self, response):
    """Extract product name, price, link and review count, then queue
    pages 2-9 of the category."""
    item = AutopjtItem()
    item['name'] = response.xpath("//a[@class='pic']/@title").extract()
    item['price'] = response.xpath(
        "//span[@class='price_n']/text()").extract()
    item['link'] = response.xpath("//a[@class='pic']/@href").extract()
    item['comnum'] = response.xpath(
        "//a[@name='itemlist-review']/text()").extract()
    # print(item['name']); print(item['price'])
    # print(item['link']); print(item['comnum'])
    yield item
    # Pagination starts at 2 because page 1 is the entry URL.
    for page in range(2, 10):
        next_url = f"http://category.dangdang.com/pg{page}-cid4002203.html"
        yield Request(next_url, callback=self.parse)
def parse(self, response):
    """Yield one item per product <li>, then follow the "next page" link.

    Bug fix: the original assigned the product href to item['price'] and
    the price text to item['link'] — the opposite of what its own comments
    said. The two assignments are swapped back so each field holds the
    value it is named for.

    NOTE(review): extract_first() may return None when a node is missing,
    which would make .strip() raise AttributeError — unchanged here, but
    worth guarding if listings can lack a review count or price.
    """
    for li in response.xpath('//div[@id="search_nature_rg"]/ul/li'):
        item = AutopjtItem()
        # Product name
        item['name'] = li.xpath('./a/@title').extract_first()
        # Product link (was wrongly stored in item['price'])
        item['link'] = li.xpath('./a/@href').extract_first()
        # Review count, trailing "条评论" characters stripped
        item['comnum'] = li.xpath(
            './p[@class="star"]/a/text()').extract_first().strip('条评论')
        # Price, leading currency sign stripped (was wrongly in item['link'])
        item['price'] = li.xpath(
            './p/span/text()').extract_first().strip('¥')
        yield item
        print(item['name'])
    # Follow pagination when a "下一页" (next page) anchor exists.
    page = response.xpath(
        '//a[text()[contains(., "下一页")]]/@href').extract_first()
    if bool(page):
        fu = furl(response.url)
        fu_base = fu.copy().remove(path=True, args=True)
        next_page = fu_base.add(path=page).url  # build the absolute URL
        yield scrapy.Request(url=next_page, callback=self.parse)