Beispiel #1
0
    def parse(self, response):
        """Collect the reviews on one comment page, queue follow-up pages, and save the reviews.

        Every review found under the ``hotelCommentList`` element becomes a
        dict with the keys ``author``, ``date``, ``room_type``,
        ``total_overall_rating``, ``review`` and ``helpful``.  After the
        follow-up requests have been yielded, the list of dicts is dumped as
        UTF-8 JSON and passed to ``self.writeAppendFile``.

        Args:
            response: the page to parse; it is handed to ``Selector`` and its
                ``url`` attribute is used to build the output name and the
                follow-up links.

        Yields:
            Request: one request each for ``currentPage=1`` and
            ``currentPage=2``, built from ``response.url`` and routed back to
            this callback.

        Raises:
            IndexError: if a review field's XPath matches nothing, or if
                ``response.url`` has fewer query parameters than expected.
        """
        sel = Selector(response)
        review_list = []

        def first_text(xpath):
            # Stripped first match of ``xpath``; IndexError when nothing matches.
            return sel.xpath(xpath).extract()[0].strip()

        # The output name is the value of the URL's second query parameter.
        # It is resolved before the loop so that it is defined even when the
        # page contains no reviews.
        filename = response.url.split('?')[1].split('&')[1].split('=')[1]

        # Number of reviews per page
        # NOTE(review): this counts text() children of the list, not its <li>
        # elements -- confirm that the two always agree for this markup.
        num = len(sel.xpath('//*[@id="hotelCommentList"]/text()'))
        # Number of pages per hotel
        page_list = len(sel.xpath('/html/body/div/div/div/div[3]/div[1]/text()'))

        # Label of the 11th pagination anchor (diagnostic output only).  It is
        # read through extract() rather than by slicing the selector's repr,
        # and a missing anchor no longer aborts the callback.
        page_texts = sel.xpath('/html/body/div/div/div/div[3]/div[1]/a[11]/span/text()').extract()
        page = page_texts[0] if page_texts else None
        print(page_list - 2)
        print(page)

        for flag in xrange(1, num + 1):
            # Review
            row = '//*[@id="hotelCommentList"]/li[%d]' % flag
            item = hotelReview()

            item['author'] = first_text(row + '/div[1]/p[2]/text()')
            item['date'] = first_text(row + '/div[1]/p[3]/text()')
            item['room_type'] = first_text(row + '/div[2]/div/span[1]/text()')
            item['total_overall_rating'] = first_text(row + '/div[2]/div/span[3]/text()')
            item['review'] = first_text(row + '/div[2]/p[3]/text()')
            item['helpful'] = first_text(row + '/div[2]/p[1]/a/text()')

            review_list.append(dict(item))

            print(len(review_list))

        # The fifth query parameter is swapped for the page number to request.
        page_param = urlparse(response.url)[4].split('&')[4]
        for key in range(1, 3):
            link = response.url.replace(page_param, 'currentPage=' + str(key))
            print(link)
            # Follow the computed link; a fixed single-hotel URL was requested
            # here before, which left ``link`` unused.
            yield Request(url=link, callback=self.parse)

        # Write the file like the pipe
        con = json.dumps(review_list, ensure_ascii=False).encode('utf8')
        self.writeAppendFile(filename, con)
Beispiel #2
0
	def parse(self, response):
		"""Append the page's ``comment_detail`` texts to a per-page file as JSON.

		The response body is printed (decoded from GB2312 and re-encoded as
		UTF-8), the text nodes of every element with class ``comment_detail``
		are dumped to a UTF-8 JSON string stored in the item's ``content``
		field, and that string is appended to a file named after the last URL
		path segment up to its first underscore.
		"""
		selector = Selector(response)
		review_item = hotelReview()
		detail_texts = selector.xpath(".//*[@class='comment_detail']/text()").extract()
		print(str(response.body).decode('GB2312').encode('utf8'))

		# Last path segment of the URL, cut at its first underscore.
		last_segment = response.url.split('/')[-1]
		target_name = last_segment.split('_')[0]

		serialized = json.dumps(detail_texts, ensure_ascii=False)
		review_item['content'] = serialized.encode('utf8')
		print(review_item)

		with open(target_name, 'a') as out:
			out.write(str(review_item['content']))
Beispiel #3
0
    def parseReview(self, response):
        """Scrape a hotel's rating summary and the reviews on this page, then save them.

        The output is a list whose first element is the hotel overview dict
        and whose remaining elements are one dict per review.  The list is
        dumped as UTF-8 JSON and passed to ``self.writeAppendFile`` together
        with a name taken from the second query parameter of
        ``response.url``.

        Args:
            response: the review page; it is handed to ``Selector`` and its
                ``url`` attribute supplies the output name.

        Raises:
            IndexError: if an overview XPath matches nothing, or the matched
                text holds no digits where a count is expected.  Errors
                raised while reading the reviews or writing the file are
                logged as a warning instead of being propagated.
        """
        sel = Selector(response)

        def first_text(xpath):
            # Stripped first match of ``xpath``; IndexError when nothing matches.
            return sel.xpath(xpath).extract()[0].strip()

        def first_number(text):
            # First run of digits in ``text``; IndexError when there is none.
            return re.findall(r'\d+', text)[0]

        # hotel profile
        hotel_overview = {}
        hotel_url = sel.xpath('/html/body/div/div/div/div[2]/a[1]/@href').extract()
        hotel_overview['url'] = 'http://hotels.ctrip.com' + str(hotel_url[0].split('_')[0])

        header = '/html/body/div/div/div/div[1]'
        hotel_overview['total_overall_rating'] = first_text(header + '/div[1]/span[2]/span/text()')
        hotel_overview['per_recomment'] = first_text(header + '/div[1]/span[3]/span/text()')

        # The n-th link inside #comment_statistics carries the count for the
        # n-th key below.
        traveller_keys = ('for_biz', 'for_friend', 'for_couple', 'for_family',
                          'for_single', 'for_agent', 'for_others')
        for position, key in enumerate(traveller_keys, 1):
            hotel_overview[key] = first_number(
                first_text('//*[@id="comment_statistics"]/a[%d]/span/text()' % position))

        # The n-th paragraph of the averages block maps to the n-th key below.
        average_keys = ('avg_location', 'avg_facility', 'avg_service', 'avg_clean')
        for position, key in enumerate(average_keys, 1):
            hotel_overview[key] = first_text(header + '/div[3]/p[%d]/span/text()' % position)

        # Element ids are spelled exactly as they are queried on the page.
        count_ids = (('all_comment', 'All_Commnet'),
                     ('recomment', 'Recomment'),
                     ('no_recomment', 'No_Recoment'))
        for key, element_id in count_ids:
            hotel_overview[key] = first_number(first_text('//*[@id="%s"]/text()' % element_id))

        review_list = [dict(hotel_overview)]

        try:
            # Resolved before the loop so that a page without reviews still
            # writes the overview instead of failing on an unbound name.
            filename = response.url.split('?')[1].split('&')[1].split('=')[1]

            # Number of reviews per page
            # NOTE(review): this counts text() children of the container and
            # the loop stops one short of it -- confirm against the markup.
            num = len(sel.xpath('/html/body/div/div/div/div[3]/text()'))

            for flag in xrange(1, num):
                # Review
                row = '/html/body/div/div/div/div[3]/div[%d]' % flag
                item = hotelReview()
                print(flag)
                print('HIIIIIIIIIII')
                print(filename)

                item['author'] = first_text(row + '/div[1]/p[2]/text()')
                item['user_type'] = first_text(row + '/div[1]/p[1]/@title')
                item['date'] = first_text(row + '/p/span[3]/a/text()')
                item['room_type'] = first_text(row + '/div[1]/p[3]/text()')
                item['review_overall_rating'] = first_text(row + '/p/span[2]/span/text()')

                # data-value is a comma-separated string; its first four parts
                # are read as location, facility, service and clean.
                # NOTE(review): an older sample comment here listed the
                # aspects as cleanliness, service, facility, location --
                # confirm that the order used below matches the page.
                aspects = first_text(row + '/p/span[1]/@data-value').split(',')
                item['location'] = first_number(aspects[0])
                item['facility'] = first_number(aspects[1])
                item['service'] = first_number(aspects[2])
                item['clean'] = first_number(aspects[3])

                item['review'] = first_text(row + '/div[2]/text()')
                item['helpful'] = first_number(first_text(row + '/div[2]/a/span/text()'))

                review_list.append(dict(item))

                print(review_list)

            # Write the file like the pipe
            con = json.dumps(review_list, ensure_ascii=False).encode('utf8')
            self.writeAppendFile(filename, con)
        except Exception as exc:
            # Best effort: a broken page is reported and skipped.  The cause
            # is appended so that the warning can be diagnosed.
            log.msg("Review Error !!!!" + response.url + " (" + repr(exc) + ")",
                    level=log.WARNING)
    def parseReview(self, response):
        """Extract the hotel overview plus every review on the page and append them to a file.

        The hotel overview dict is stored first, followed by one dict per
        review.  The resulting list is serialised as UTF-8 JSON and passed to
        ``self.writeAppendFile``; the target name is the value of the second
        query parameter of ``response.url``.

        Args:
            response: the review page; it is wrapped in a ``Selector`` and
                its ``url`` attribute is used for the output name.

        Raises:
            IndexError: if an overview XPath has no match, or a count field
                contains no digits.  Failures while reading reviews or
                writing the file are logged as a warning and not re-raised.
        """
        sel = Selector(response)

        def text_at(xpath):
            # Stripped first match of ``xpath``; IndexError when nothing matches.
            return sel.xpath(xpath).extract()[0].strip()

        def digits_in(text):
            # First run of digits in ``text``; IndexError when there is none.
            return re.findall(r'\d+', text)[0]

        # hotel profile
        hotel_url = sel.xpath('/html/body/div/div/div/div[2]/a[1]/@href').extract()
        summary = '/html/body/div/div/div/div[1]'
        hotel_overview = {
            'url': 'http://hotels.ctrip.com' + str(hotel_url[0].split('_')[0]),
            'total_overall_rating': text_at(summary + '/div[1]/span[2]/span/text()'),
            'per_recomment': text_at(summary + '/div[1]/span[3]/span/text()'),
        }

        # Counts per traveller type: the n-th link inside #comment_statistics
        # belongs to the n-th key.
        for index, key in enumerate(('for_biz', 'for_friend', 'for_couple', 'for_family',
                                     'for_single', 'for_agent', 'for_others'), 1):
            hotel_overview[key] = digits_in(
                text_at('//*[@id="comment_statistics"]/a[%d]/span/text()' % index))

        # Average aspect scores: the n-th paragraph belongs to the n-th key.
        for index, key in enumerate(('avg_location', 'avg_facility',
                                     'avg_service', 'avg_clean'), 1):
            hotel_overview[key] = text_at(summary + '/div[3]/p[%d]/span/text()' % index)

        # Element ids are spelled exactly as they are queried on the page.
        for key, element_id in (('all_comment', 'All_Commnet'),
                                ('recomment', 'Recomment'),
                                ('no_recomment', 'No_Recoment')):
            hotel_overview[key] = digits_in(text_at('//*[@id="%s"]/text()' % element_id))

        review_list = [dict(hotel_overview)]

        try:
            # Bound ahead of the loop: with no reviews on the page the name
            # used to be undefined at the write below, and the resulting
            # error was swallowed, so the overview was never saved.
            filename = response.url.split('?')[1].split('&')[1].split('=')[1]

            # Number of reviews per page
            # NOTE(review): derived from the container's text() children, and
            # the loop excludes the last index -- confirm against the markup.
            num = len(sel.xpath('/html/body/div/div/div/div[3]/text()'))

            for flag in xrange(1, num):
                # Review
                review_root = '/html/body/div/div/div/div[3]/div[' + str(flag) + ']'
                item = hotelReview()
                print(flag)
                print('HIIIIIIIIIII')
                print(filename)

                item['author'] = text_at(review_root + '/div[1]/p[2]/text()')
                item['user_type'] = text_at(review_root + '/div[1]/p[1]/@title')
                item['date'] = text_at(review_root + '/p/span[3]/a/text()')
                item['room_type'] = text_at(review_root + '/div[1]/p[3]/text()')
                item['review_overall_rating'] = text_at(review_root + '/p/span[2]/span/text()')

                # The data-value attribute is split on commas once; parts 0-3
                # are stored as location, facility, service and clean.
                # NOTE(review): an earlier sample comment showed the aspects
                # in the order cleanliness, service, facility, location --
                # verify that the mapping below is the intended one.
                rating_parts = text_at(review_root + '/p/span[1]/@data-value').split(',')
                for part, key in zip(rating_parts[:4],
                                     ('location', 'facility', 'service', 'clean')):
                    item[key] = digits_in(part)
                if len(rating_parts) < 4:
                    # Keep the original failure mode for short attribute values.
                    raise IndexError('list index out of range')

                item['review'] = text_at(review_root + '/div[2]/text()')
                item['helpful'] = digits_in(text_at(review_root + '/div[2]/a/span/text()'))

                review_list.append(dict(item))

                print(review_list)

            # Write the file like the pipe
            con = json.dumps(review_list, ensure_ascii=False).encode('utf8')
            self.writeAppendFile(filename, con)
        except Exception as exc:
            # Best effort: report the page and carry on.  The cause is
            # included so that the warning is actionable.
            log.msg("Review Error !!!!" + response.url + " (" + repr(exc) + ")",
                    level=log.WARNING)
Beispiel #5
0
    def parse(self, response):
        """Parse one page of hotel comments, request further pages, and store the comments.

        Each review under the ``hotelCommentList`` element is turned into a
        dict (``author``, ``date``, ``room_type``, ``total_overall_rating``,
        ``review``, ``helpful``).  Once the follow-up requests have been
        yielded, the collected dicts are serialised as UTF-8 JSON and passed
        to ``self.writeAppendFile``.

        Args:
            response: the page to parse; it is wrapped in a ``Selector`` and
                its ``url`` attribute drives both the output name and the
                follow-up links.

        Yields:
            Request: requests for the ``currentPage=1`` and ``currentPage=2``
            variants of ``response.url``, with this method as callback.

        Raises:
            IndexError: if a review field's XPath has no match, or if
                ``response.url`` lacks the expected query parameters.
        """
        sel = Selector(response)
        review_list = []

        def text_at(xpath):
            # Stripped first match of ``xpath``; IndexError when nothing matches.
            return sel.xpath(xpath).extract()[0].strip()

        # Value of the URL's second query parameter.  Bound before the loop
        # so that the final write works even when no review was found.
        filename = response.url.split('?')[1].split('&')[1].split('=')[1]

        # Number of reviews per page
        # NOTE(review): counts the list's text() children rather than its
        # <li> elements -- confirm that this equals the number of reviews.
        num = len(sel.xpath('//*[@id="hotelCommentList"]/text()'))
        # Number of pages per hotel
        page_list = len(
            sel.xpath('/html/body/div/div/div/div[3]/div[1]/text()'))

        # Text of the 11th pagination anchor, printed for diagnostics only.
        # extract() replaces the earlier slicing of the selector's repr, so
        # an absent anchor prints None instead of raising.
        anchor_texts = sel.xpath(
            '/html/body/div/div/div/div[3]/div[1]/a[11]/span/text()').extract()
        page = anchor_texts[0] if anchor_texts else None
        print(page_list - 2)
        print(page)

        for flag in xrange(1, num + 1):
            # Review
            entry = '//*[@id="hotelCommentList"]/li[' + str(flag) + ']'
            item = hotelReview()

            item['author'] = text_at(entry + '/div[1]/p[2]/text()')
            item['date'] = text_at(entry + '/div[1]/p[3]/text()')
            item['room_type'] = text_at(entry + '/div[2]/div/span[1]/text()')
            item['total_overall_rating'] = text_at(entry + '/div[2]/div/span[3]/text()')
            item['review'] = text_at(entry + '/div[2]/p[3]/text()')
            item['helpful'] = text_at(entry + '/div[2]/p[1]/a/text()')

            review_list.append(dict(item))

            print(len(review_list))

        # The fifth query parameter is the one replaced by the page number.
        old_page_param = urlparse(response.url)[4].split('&')[4]
        for key in range(1, 3):
            link = response.url.replace(old_page_param,
                                        'currentPage=' + str(key))
            print(link)
            # Request the computed link; previously a fixed URL for a single
            # hotel was requested and ``link`` was never used.
            yield Request(url=link, callback=self.parse)

        # Write the file like the pipe
        con = json.dumps(review_list, ensure_ascii=False).encode('utf8')
        self.writeAppendFile(filename, con)