def parse(self, response):
    """Parse a Mafengwo spot detail page.

    Extracts the suggested visit duration and the comment count; when the
    spot has comments, follows the mobile comment page (carrying the item
    in meta), otherwise records zero counts and yields the item directly.
    """
    item = TravelItem()
    try:
        # Suggested visit time, e.g. "1-3小时"; .get() yields None when
        # the node is absent.
        item['spot_time'] = response.xpath(
            "//div[@data-jump='costtime']/text()").get()
    except Exception:
        # Py3 fix: the original called "未知".decode("utf-8"), which is
        # Python 2 only — str has no .decode() in Python 3.
        item['spot_time'] = "未知"
        print("找不到参考时间")
    try:
        spot_ct = response.xpath(
            "//a[@class='commentNum']/strong/text()").get()
        # Raw string instead of the Py2-only "(\d*)条".decode("utf-8").
        item['spot_ct'] = re.findall(r"(\d*)条", spot_ct)[0]
    except (TypeError, IndexError):
        # TypeError: spot_ct is None (no comment node);
        # IndexError: the pattern matched nothing.
        item['spot_ct'] = 0
        item['spot_gd'] = 0
        item['spot_bd'] = 0
        print("没有评论")
        yield item
    else:
        # Mobile comment page URL is built from the numeric POI id
        # embedded in the detail-page URL.
        spot_comment_url = ("https://m.mafengwo.cn/poi/comment_"
                            + re.findall(r"/(\d*)\.html", response.url)[0]
                            + ".html")
        print(spot_comment_url)
        yield scrapy.Request(
            spot_comment_url,
            callback=self.parse_spot_comment,
            meta={"item": item},
            flags=[1]
        )
def parse(self, response):
    """Parse the Qunar sight-list JSON API response.

    Yields one TravelItem per sight, then requests the next page until
    ``totalCount`` is exhausted (15 results per page).
    """
    # response.text replaces body_as_unicode(), which is deprecated and
    # removed in Scrapy 2.x.
    travel_data = json.loads(response.text)['data']
    total_count = travel_data['totalCount']
    sight_list = travel_data['sightList']
    print(total_count)
    for sight_item in sight_list:
        item = TravelItem()
        item['name'] = sight_item['sightName']
        # Optional fields fall back to the placeholder "暂无"
        # (dict.get replaces the original `if key in dict` branches).
        item['level'] = sight_item.get('star', '暂无')
        item['province'] = sight_item['districts']
        item['price'] = sight_item['qunarPrice']
        item['sales'] = sight_item['saleCount']
        item['score'] = sight_item['score']
        # NOTE(review): 'loaction' is misspelled, but it must match the
        # TravelItem field declaration — do not rename in this block alone.
        item['loaction'] = sight_item['address']
        item['slogan'] = sight_item.get('intro', '暂无')
        # "point" is "longitude,latitude".
        point = sight_item['point'].split(',')
        item['longitude'] = point[0]
        item['latitude'] = point[1]
        yield item
    # 15 results per page; keep paginating while more remain.
    if (self.page * 15) < total_count:
        self.page += 1
        yield scrapy.Request(self.url + str(self.page), callback=self.parse)
def parse(self, response):
    """Parse a destination index page.

    For each city link: record its cover image and follow the city detail
    page (item passed via meta). Afterwards, pop and request the next
    queued index URL, one per parse() call.
    """
    city_links = response.xpath("//ul[@class='clearfix']//li/div/a")
    for link in city_links:
        # Fresh item per city — the original reused one item across the
        # loop and relied solely on deepcopy to avoid shared state.
        item = TravelItem()
        city_list_url = "http://www.mafengwo.cn" + link.xpath("./@href").get()
        print(city_list_url)
        item['city_img'] = link.xpath("./img/@data-original").get()
        yield scrapy.Request(
            city_list_url,
            callback=self.parse_city_list,
            # deepcopy kept so each request still owns its item.
            meta={'item': copy.deepcopy(item)}
        )
    # Drain the pre-collected index URLs.
    if self.city_url_list:
        next_url = self.city_url_list.pop(0)
        print("发送请求" + next_url)
        print(next_url)
        yield scrapy.Request(next_url, callback=self.parse)
def parse(self, response):
    """Extract province and city names/ids from a city page breadcrumb.

    Ids are the numeric part of the breadcrumb hrefs
    (e.g. ``/travel-scenic-spot/mafengwo/12345.html`` -> 12345).
    """
    item = TravelItem()
    print("进入parse")
    if response.status != 301:
        item['province'] = response.xpath(
            "//div[@class='crumb']/div[2]//div//span/a/text()").get()
        print(item['province'])
        province_id = response.xpath(
            "//div[@class='crumb']/div[2]//div//span/a/@href").get()
        # 省标号 (province id)
        try:
            item['province_id'] = int(
                re.findall(r"/(\d*)\.", province_id)[0])
        except (TypeError, IndexError, ValueError):
            print("provinceid异常")
            print(province_id)
            # Bug fix: the original concatenated str + type, which raises
            # TypeError inside the handler; wrap in str().
            print("type=" + str(type(re.findall(r"/(\d*)\.", province_id)[0])))
        # 城市 (city name and id)
        item['city'] = response.xpath(
            "//div[@class='crumb']/div[3]//div//span/a/text()").get()
        city_id = response.xpath(
            "//div[@class='crumb']/div[3]//div//span/a/@href").get()
        # 城市标号
        item['city_id'] = int(re.findall(r"/(\d*)\.", city_id)[0])
        yield item
def parse(self, response):
    """Parse a Ctrip travel-notes list page (10 entries per page).

    Each entry gets a detail request whose callback depends on its edition
    flag (last digit of the badge's CSS class); pagination continues while
    ``self.page`` <= 6.
    """
    # All entry fields hang off the same absolute path; hoist the prefix.
    base = "/html/body/div[4]/div/div[2]/div/div[2]/a"
    for i in range(10):
        item = TravelItem()
        self.id = self.id + 1
        item['id'] = self.id
        entry = base + "[" + str(i + 1) + "]"
        item['title'] = response.xpath(
            entry + "/div/dl/dt/text()").extract_first()
        item['introduction'] = response.xpath(
            entry + "/div/dl/dd[2]/text()").extract_first()
        url = "https://you.ctrip.com" + str(
            response.xpath(entry + "/@href").extract_first())
        genres = response.xpath(
            entry + "/div/span/span[1]/@class").extract()
        # Edition flag = last char of the badge class.
        # No badge -> default parser; '4' (典藏版) has its own layout;
        # '1' 精华版 / '3' 实用版 / '2' 美图版 share new_parse1.
        # Any other flag is intentionally skipped (original behavior).
        if not genres:
            yield scrapy.Request(url=url, meta={'item': item},
                                 callback=self.new_parse1)
        elif genres[0][-1:] == '4':
            yield scrapy.Request(url=url, meta={'item': item},
                                 callback=self.new_parse4)
        elif genres[0][-1:] in ('1', '3', '2'):
            yield scrapy.Request(url=url, meta={'item': item},
                                 callback=self.new_parse1)
    if self.page <= 6:
        # `next_href` instead of `next` — avoid shadowing the builtin.
        next_href = response.xpath(
            "/html/body/div[4]/div/div[2]/div/div[2]/div[2]/div/a[7]/@href"
        ).extract_first()
        next_url = response.urljoin(next_href)
        self.page += 1
        yield scrapy.Request(url=next_url, callback=self.parse)
def page_two(self, response):
    """Scrape title, one-line blurb, description paragraphs and price
    from a sight detail page and yield them as one item."""
    info = TravelItem()
    # Field -> CSS selector; extract() keeps the full matched markup.
    selectors = {
        'title': '.mp-description-detail .mp-description-view span',
        'onesentence': '.mp-description-onesentence',
        'content': '.mp-charact-desc p',
        'price': '.mp-description-qunar-price em',
    }
    for field, css in selectors.items():
        info[field] = response.css(css).extract()
    yield info
def parse(self, response):
    """For each city link on the index page, request its travel-notes
    (游记) listing, carrying the city id in meta."""
    item = TravelItem()
    city_links = response.xpath("//ul[@class='clearfix']//li/div/a")
    for c in city_links:
        # Numeric city id from the href, e.g. ".../12345.html".
        item["city_id"] = int(
            re.findall(r"/(\d*)\.", c.xpath("./@href").get())[0])
        # Bug fix: city_id is an int — the original concatenated
        # str + int, which raises TypeError; wrap in str().
        yj_url = "http://www.mafengwo.cn/yj/" + str(item["city_id"])
        yield scrapy.Request(
            yj_url,
            callback=self.parse_yj,
            # deepcopy so each request owns its own item snapshot.
            meta={'item': copy.deepcopy(item)},
            dont_filter=True
        )
def parse(self, response):
    """Parse the city grid: record name, image and id per city, then
    request the first two ajax pages of that city's spot list."""
    item = TravelItem()
    city_list = response.xpath("//div[@class='cityPoi clearfix']/ul//li")
    for city in city_list:
        city_url = city.xpath("./a/@href").get()
        item["city"] = city.xpath(".//div[@class='name']/p[1]/text()").get()
        item["city_img"] = city.xpath(
            ".//div[@class='photo']/img/@data-original").get()
        item['city_id'] = int(re.findall(r"/mdd/(\d*)", city_url)[0])
        spot_url_page = ("https://m.mafengwo.cn/jd/" + str(item["city_id"])
                         + "/gonglve.html?page={}&is_ajax=1")
        # Distinct inner variable: the original reused `i` for both
        # loops, clobbering the outer loop variable.
        for page in range(1, 3):
            yield scrapy.Request(spot_url_page.format(page),
                                 callback=self.parse_spot_list,
                                 meta={"item": copy.deepcopy(item)},
                                 dont_filter=True)
def parse(self, response):
    """For each city on the index page, attach its cover image and follow
    the city detail page with the item in meta."""
    city_links = response.xpath("//ul[@class='clearfix']//li/div/a")
    for c in city_links:
        # Fresh item per city — the original created one item outside the
        # loop and relied solely on deepcopy to avoid shared state.
        item = TravelItem()
        city_list_url = "http://www.mafengwo.cn" + c.xpath("./@href").get()
        item['city_img'] = c.xpath("./img/@data-original").get()
        yield scrapy.Request(
            city_list_url,
            callback=self.parse_city_list,
            meta={'item': copy.deepcopy(item)},
            dont_filter=True)
def parse(self, response):
    """Parse the ajax city-list response (JSON-escaped HTML fragment).

    Extracts each city's url, image and name with regexes, requests the
    city's spot page, then posts the form request for the next page while
    queued page numbers remain.
    """
    print(response.text)
    # Py3 fix: response.body is bytes; use the decoded text and strip the
    # JSON backslash-escaping so the href/img regexes match.
    body = response.text.replace("\\", "")
    print(body)
    # Hoisted out of the loop — the original re-ran findall per iteration.
    hrefs = re.findall("<a href=\"(.*?)\">", body)
    images = re.findall("data-original=\"(.*?)\"", body)
    names = re.findall(r"<p class=\\\"t1\\\">(\S*?)<", response.text)
    item = TravelItem()
    for i in range(len(hrefs)):
        item["city_img"] = images[i]
        # City names arrive as \uXXXX escapes; round-trip through bytes
        # to decode them (str.decode does not exist in Python 3).
        # NOTE(review): assumes the escaped name is pure ASCII — confirm.
        item["city"] = names[i].encode().decode('unicode_escape')
        item['city_id'] = int(re.findall(r"/mdd/(\d*)", hrefs[i])[0])
        # Bug fix: city_id is an int — str + int raises TypeError; str() it.
        city_url_page = ("https://m.mafengwo.cn/jd/" + str(item['city_id'])
                         + "/gonglve.html")
        # Bug fix: pass the callback itself — the original *called* it
        # (callback=self.parse_city_list()).
        yield scrapy.Request(city_url_page, callback=self.parse_city_list)
    if self.next_urls:
        formdata = {"mddid": "21536", "page": self.next_urls.pop(0)}
        yield scrapy.FormRequest(url=self.start_urls[0],
                                 formdata=formdata,
                                 callback=self.parse,
                                 dont_filter=True)