def parse(self, response):
    """Parse a Mafengwo POI page.

    Extracts the suggested visit duration and the comment count; when
    comments exist, follows to the mobile comment page, otherwise emits
    the item with zeroed comment fields.
    """
    item = TravelItem()

    # XPath .get() returns None when the node is missing — it does not
    # raise, so the original try/except around it was dead code, and
    # "未知".decode("utf-8") would itself raise on a Python 3 str.
    spot_time = response.xpath("//div[@data-jump='costtime']/text()").get()
    if spot_time is None:
        print("找不到参考时间")
        spot_time = "未知"
    item['spot_time'] = spot_time

    spot_ct = response.xpath("//a[@class='commentNum']/strong/text()").get()
    try:
        # e.g. "123条" -> "123"; TypeError when spot_ct is None,
        # IndexError when the pattern finds nothing.
        item['spot_ct'] = re.findall(r"(\d*)条", spot_ct)[0]
    except (TypeError, IndexError):
        # No comment count on the page: record zeros and emit as-is.
        item['spot_ct'] = 0
        item['spot_gd'] = 0
        item['spot_bd'] = 0
        print("没有评论")
        yield item
    else:
        # Follow to the mobile comment page for this POI id.
        spot_comment_url = ("https://m.mafengwo.cn/poi/comment_"
                            + re.findall(r"/(\d*)\.html", response.url)[0]
                            + ".html")
        print(spot_comment_url)
        yield scrapy.Request(
            spot_comment_url,
            callback=self.parse_spot_comment,
            meta={"item": item},
            flags=[1],
        )
# --- Code example #2 (score: 0) ---
    def parse(self, response):
        """Parse the Qunar sight-list JSON API response.

        Yields one TravelItem per sight, then requests the next page
        until ``page * 15`` reaches the reported total count.
        """
        # response.body_as_unicode() is deprecated; response.text is its
        # documented replacement.
        travel_data = json.loads(response.text)['data']
        total_count = travel_data['totalCount']
        print(total_count)
        for sight_item in travel_data['sightList']:
            item = TravelItem()
            item['name'] = sight_item['sightName']
            # Optional fields fall back to "暂无".
            item['level'] = sight_item.get('star', '暂无')
            item['province'] = sight_item['districts']
            item['price'] = sight_item['qunarPrice']
            item['sales'] = sight_item['saleCount']
            item['score'] = sight_item['score']
            # NOTE(review): 'loaction' looks like a typo for 'location',
            # but the key must match the TravelItem field definition —
            # confirm before renaming.
            item['loaction'] = sight_item['address']
            item['slogan'] = sight_item.get('intro', '暂无')
            # "lng,lat" -> separate longitude / latitude strings.
            point = sight_item['point'].split(',')
            item['longitude'] = point[0]
            item['latitude'] = point[1]
            yield item

        # Paginate: the API serves 15 sights per page.
        if (self.page * 15) < total_count:
            self.page += 1
            yield scrapy.Request(self.url + str(self.page),
                                 callback=self.parse)
    def parse(self, response):
        """Parse the Mafengwo city index page.

        Requests each city's detail page, then pulls the next queued
        index URL (one per call) from ``self.city_url_list``.
        """
        for city_link in response.xpath("//ul[@class='clearfix']//li/div/a"):
            # Fresh item per city: the original mutated one shared item
            # and relied on copy.deepcopy in meta to avoid cross-request
            # contamination.
            item = TravelItem()
            city_list_url = ("http://www.mafengwo.cn"
                             + city_link.xpath("./@href").get())
            print(city_list_url)
            # Lazy-loaded city thumbnail URL.
            item['city_img'] = city_link.xpath("./img/@data-original").get()
            # Enter the city detail page, handing the item along.
            yield scrapy.Request(
                city_list_url,
                callback=self.parse_city_list,
                meta={'item': item},
            )

        # Keep crawling queued index pages.
        if self.city_url_list:
            next_url = self.city_url_list.pop(0)
            print("发送请求" + next_url)
            print(next_url)
            yield scrapy.Request(
                next_url,
                callback=self.parse
            )
    def parse(self, response):
        """Extract province and city name/id from a page breadcrumb and
        yield one partially-filled TravelItem (skipped on a 301)."""
        item = TravelItem()
        print("进入parse")
        if response.status != 301:
            # Province name + numeric id from the breadcrumb link.
            item['province'] = response.xpath(
                "//div[@class='crumb']/div[2]//div//span/a/text()").get()
            print(item['province'])
            province_id = response.xpath(
                "//div[@class='crumb']/div[2]//div//span/a/@href").get()
            try:
                item['province_id'] = int(
                    re.findall(r"/(\d*)\.", province_id)[0])
            except (TypeError, IndexError, ValueError):
                # href missing, pattern not found, or non-numeric match.
                # The original handler's  "type=" + type(...)  itself
                # raised TypeError (str + type concatenation) — removed.
                print("provinceid异常")
                print(province_id)
            # City name + numeric id from the next breadcrumb segment.
            item['city'] = response.xpath(
                "//div[@class='crumb']/div[3]//div//span/a/text()").get()
            city_id = response.xpath(
                "//div[@class='crumb']/div[3]//div//span/a/@href").get()
            item['city_id'] = int(re.findall(r"/(\d*)\.", city_id)[0])
            yield item
# --- Code example #5 (score: 0) ---
 def parse(self, response):
     """Parse a Ctrip travel-note list page.

     Builds an item for each of the 10 entries, dispatches to the
     detail parser matching the entry's edition badge, and follows the
     next-page link for up to 6 pages.
     """
     # The original repeated this absolute-path prefix in every xpath.
     entry_xpath = "/html/body/div[4]/div/div[2]/div/div[2]/a[{}]"
     # Edition-class suffix -> detail callback. '4' is the collector's
     # edition (典藏版) with its own parser; '1' (精华版), '3' (实用版)
     # and '2' (美图版) share new_parse1. Any other suffix is skipped,
     # exactly as the original if/elif chain did.
     dispatch = {
         '4': self.new_parse4,
         '1': self.new_parse1,
         '3': self.new_parse1,
         '2': self.new_parse1,
     }
     for i in range(10):
         entry = entry_xpath.format(i + 1)
         item = TravelItem()
         self.id = self.id + 1
         item['id'] = self.id
         item['title'] = response.xpath(
             entry + "/div/dl/dt/text()").extract_first()
         item['introduction'] = response.xpath(
             entry + "/div/dl/dd[2]/text()").extract_first()
         url = "https://you.ctrip.com" + str(
             response.xpath(entry + "/@href").extract_first())
         genres = response.xpath(
             entry + "/div/span/span[1]/@class").extract()
         if not genres:
             # No edition badge: treat as the default edition.
             yield scrapy.Request(url=url,
                                  meta={'item': item},
                                  callback=self.new_parse1)
         else:
             callback = dispatch.get(genres[0][-1:])
             if callback is not None:
                 yield scrapy.Request(url=url,
                                      meta={'item': item},
                                      callback=callback)

     # Follow the "next page" link for the first 6 pages.
     if self.page <= 6:
         next_href = response.xpath(
             "/html/body/div[4]/div/div[2]/div/div[2]/div[2]/div/a[7]/@href"
         ).extract_first()
         next_url = response.urljoin(next_href)
         self.page += 1
         yield scrapy.Request(url=next_url, callback=self.parse)
# --- Code example #6 (score: 0) ---
# File: gowhere.py  Project: brilliantduck/hi
 def page_two(self, response):
     """Extract detail-page fields into a TravelItem via CSS selectors.

     Each field keeps the full list of matching nodes (``.extract()``),
     exactly as the original assignments did.
     """
     field_selectors = {
         'title': '.mp-description-detail .mp-description-view span',
         'onesentence': '.mp-description-onesentence',
         'content': '.mp-charact-desc p',
         'price': '.mp-description-qunar-price em',
     }
     info = TravelItem()
     for field, selector in field_selectors.items():
         info[field] = response.css(selector).extract()
     yield info
 def parse(self, response):
     """For each city link on the index page, request that city's
     travel-notes (游记) listing."""
     for city_link in response.xpath("//ul[@class='clearfix']//li/div/a"):
         # Fresh item per city so queued requests never share state
         # (the original reused one item and deepcopied it into meta).
         item = TravelItem()
         # Numeric city id parsed out of the link href.
         item["city_id"] = int(
             re.findall(r"/(\d*)\.", city_link.xpath("./@href").get())[0])
         # BUG FIX: city_id is an int; the original concatenated it
         # directly with a str, which raises TypeError.
         yj_url = "http://www.mafengwo.cn/yj/" + str(item["city_id"])
         yield scrapy.Request(
             yj_url,
             callback=self.parse_yj,
             meta={'item': item},
             dont_filter=True,
         )
    def parse(self, response):
        """Parse the city grid: capture name/image/id per city, then
        request the first two pages of its sight-list AJAX endpoint."""
        city_list = response.xpath("//div[@class='cityPoi clearfix']/ul//li")
        for city in city_list:
            # Fresh item per city (the original created one item outside
            # the loop); the inner page loop also reused the outer loop
            # variable ``i`` — renamed to avoid the shadowing.
            item = TravelItem()
            city_url = city.xpath("./a/@href").get()
            item["city"] = city.xpath(".//div[@class='name']/p[1]/text()").get()
            item["city_img"] = city.xpath(
                ".//div[@class='photo']/img/@data-original").get()
            item['city_id'] = int(re.findall(r"/mdd/(\d*)", city_url)[0])
            spot_url_page = ("https://m.mafengwo.cn/jd/" + str(item["city_id"])
                             + "/gonglve.html?page={}&is_ajax=1")
            # Pages 1 and 2 of the sight list; deepcopy because both
            # requests would otherwise share the same item instance.
            for page in range(1, 3):
                next_page = spot_url_page.format(page)
                yield scrapy.Request(next_page,
                                     callback=self.parse_spot_list,
                                     meta={"item": copy.deepcopy(item)},
                                     dont_filter=True)
    def parse(self, response):
        """Parse the city index page: one detail-page request per city
        link, carrying the city thumbnail in the item."""
        for city_link in response.xpath("//ul[@class='clearfix']//li/div/a"):
            # Fresh item per city: the original mutated a single shared
            # item and relied on copy.deepcopy in meta.
            item = TravelItem()
            city_list_url = ("http://www.mafengwo.cn"
                             + city_link.xpath("./@href").get())
            # Lazy-loaded thumbnail URL.
            item['city_img'] = city_link.xpath("./img/@data-original").get()
            # Enter the city detail page.
            yield scrapy.Request(
                city_list_url,
                callback=self.parse_city_list,
                meta={'item': item},
                dont_filter=True)
# --- Code example #10 (score: 0) ---
    def parse(self, response):
        item = TravelItem()
        # city_url = response.xpath("//ul[@class='clearfix']//li/div/a")
        print response.text
        r = response.body.replace("\\", "")
        print r

        for i in range(0, len(re.findall("<a href=\"(.*?)\">", r))):
            city_url = re.findall("<a href=\"(.*?)\">", r)[i]
            item["city_img"] = re.findall("data-original=\"(.*?)\"", r)[i]
            # city=re.findall("<p class=\"t1\">(.*?)</p>",r)[i]
            # item["city"]=city.replace("u","\u").decode("utf-8").encode("utf-8")
            item["city"] = re.findall(
                r"<p class=\\\"t1\\\">(\S*?)<",
                response.text)[i].decode('unicode_escape')
            # print item["city"]
            item['city_id'] = int(re.findall("/mdd/(\d*)", city_url)[0])
            city_url_page = "https://m.mafengwo.cn/jd/" + item[
                'city_id'] + "/gonglve.html"
            yield scrapy.Request(city_url_page,
                                 callback=self.parse_city_list())
        # for c in city_url:
        #     # 得到当前页面城市url
        #     city_list_url = "http://www.mafengwo.cn" + c.xpath("./@href").get()
        #     # 得到城市图片url
        #
        #     item['city_img'] = c.xpath("./img/@data-original").get()
        #
        #     # 进入城市详情页
        #     yield scrapy.Request(
        #         city_list_url,
        #         callback=self.parse_city_list,
        #         meta={"item":copy.deepcopy(item)},
        #         dont_filter=True
        #     )
        if len(self.next_urls) != 0:
            formdata = {"mddid": "21536", "page": self.next_urls[0]}
            del self.next_urls[0]
            yield scrapy.FormRequest(url=self.start_urls[0],
                                     formdata=formdata,
                                     callback=self.parse,
                                     dont_filter=True)