def parse(self, response):
    """Scrape one page of hotel comments and append them to a file.

    The number of reviews is taken from the count of text nodes directly
    under ``#hotelCommentList``; for every index ``1..num`` the matching
    ``li`` is read into a ``hotelReview`` item (author, date, room type,
    overall rating, review text, helpful text). The collected reviews are
    serialised as one UTF-8 JSON array and passed to
    ``self.writeAppendFile`` under the name taken from the value of the
    second query parameter of ``response.url``.

    Yields:
        Two ``Request`` objects for the comment endpoint, with this method
        as their callback.

    Raises:
        IndexError: if ``response.url`` does not carry the expected query
            parameters, or an indexed review lacks one of the fields.
    """
    sel = Selector(response)
    review_list = []

    # (item key, XPath relative to one review <li>)
    field_paths = (
        ('author', '/div[1]/p[2]/text()'),
        ('date', '/div[1]/p[3]/text()'),
        ('room_type', '/div[2]/div/span[1]/text()'),
        ('total_overall_rating', '/div[2]/div/span[3]/text()'),
        ('review', '/div[2]/p[3]/text()'),
        ('helpful', '/div[2]/p[1]/a/text()'),
    )

    def field(index, tail):
        """Return the stripped first match of `tail` under review `index`."""
        xpath = '//*[@id="hotelCommentList"]/li[' + str(index) + ']' + tail
        return sel.xpath(xpath).extract()[0].strip()

    # Number of reviews per page
    num = len(sel.xpath('//*[@id="hotelCommentList"]/text()'))

    # Number of pages per hotel
    page_list = len(sel.xpath('/html/body/div/div/div/div[3]/div[1]/text()'))
    # Read the pager text through extract() instead of slicing the selector
    # list's repr, and tolerate hotels that have no 11th pager link.
    page_texts = sel.xpath(
        '/html/body/div/div/div/div[3]/div[1]/a[11]/span/text()').extract()
    page = page_texts[0].strip().encode('utf8') if page_texts else ''
    print(page_list - 2)
    print(page)

    # Loop-invariant; computed up front so the final write still has a
    # target when the page contains no reviews.
    filename = response.url.split('?')[1].split('&')[1].split('=')[1]

    for flag in xrange(1, num + 1):
        # Review
        item = hotelReview()
        for name, tail in field_paths:
            item[name] = field(flag, tail)
        review_list.append(dict(item))
    print(len(review_list))

    for page_no in range(1, 3):
        link = response.url.replace(
            urlparse(response.url)[4].split('&')[4],
            'currentPage=' + str(page_no))
        print(link)
        # NOTE(review): `link` is only printed; the request below always
        # targets this fixed URL. Confirm whether `link` should be requested.
        yield Request(
            url="http://hotels.ctrip.com/Domestic/tool/AjaxGetHotelDetailComment.aspx?MasterHotelID=-1&hotel=441507&card=0&property=0&currentPage=1",
            callback=self.parse)

    # Write the file like the pipe
    con = json.dumps(review_list, ensure_ascii=False).encode('utf8')
    self.writeAppendFile(filename, con)
def parse(self, response):
    """Dump the text of every ``comment_detail`` element to a file.

    The text nodes of all elements whose class is ``comment_detail`` are
    serialised as a UTF-8 JSON array, stored in the ``content`` field of a
    ``hotelReview`` item, and appended to a file. The file name is the part
    of the last URL path segment that precedes the first underscore.

    The raw response body (decoded from GB2312, re-encoded as UTF-8) and the
    populated item are printed along the way.
    """
    selector = Selector(response)
    record = hotelReview()
    detail_texts = selector.xpath(
        ".//*[@class='comment_detail']/text()").extract()

    print(str(response.body).decode('GB2312').encode('utf8'))

    last_segment = response.url.split('/')[-1]
    target = last_segment.split('_')[0]

    record['content'] = json.dumps(
        detail_texts, ensure_ascii=False).encode('utf8')
    print(record)

    with open(target, 'a') as sink:
        sink.write(str(record['content']))
def parseReview(self, response):
    """Scrape a hotel's rating overview and its reviews, then append them.

    The first element of the written JSON array is the hotel overview (URL,
    overall rating, recommendation figures, per-traveller-type counts,
    average aspect scores, comment counts). It is followed by one entry per
    review found under the third content ``div``. The array is encoded as
    UTF-8 and passed to ``self.writeAppendFile`` under the name taken from
    the value of the second query parameter of ``response.url``.

    Errors while extracting the overview propagate (IndexError when an
    expected node is missing). Errors while extracting reviews or writing
    are logged as a warning and nothing is written.
    """
    sel = Selector(response)

    def first_text(xpath):
        """Return the stripped first match of `xpath` (IndexError if none)."""
        return sel.xpath(xpath).extract()[0].strip()

    def first_number(xpath):
        """Return the first run of digits in the first match of `xpath`."""
        return re.findall(r'\d+', first_text(xpath))[0]

    # hotel profile
    hotel_overview = {}
    hotel_url = sel.xpath('/html/body/div/div/div/div[2]/a[1]/@href').extract()
    hotel_overview['url'] = 'http://hotels.ctrip.com' + str(
        hotel_url[0].split('_')[0])

    summary = '/html/body/div/div/div/div[1]'
    hotel_overview['total_overall_rating'] = first_text(
        summary + '/div[1]/span[2]/span/text()')
    hotel_overview['per_recomment'] = first_text(
        summary + '/div[1]/span[3]/span/text()')

    # The n-th link under #comment_statistics feeds the n-th key.
    traveller_keys = ('for_biz', 'for_friend', 'for_couple', 'for_family',
                      'for_single', 'for_agent', 'for_others')
    for position, key in enumerate(traveller_keys, 1):
        hotel_overview[key] = first_number(
            '//*[@id="comment_statistics"]/a[' + str(position) +
            ']/span/text()')

    # The n-th paragraph of the averages box feeds the n-th key.
    average_keys = ('avg_location', 'avg_facility', 'avg_service',
                    'avg_clean')
    for position, key in enumerate(average_keys, 1):
        hotel_overview[key] = first_text(
            summary + '/div[3]/p[' + str(position) + ']/span/text()')

    hotel_overview['all_comment'] = first_number(
        '//*[@id="All_Commnet"]/text()')
    hotel_overview['recomment'] = first_number('//*[@id="Recomment"]/text()')
    hotel_overview['no_recomment'] = first_number(
        '//*[@id="No_Recoment"]/text()')

    review_list = [dict(hotel_overview)]

    try:
        # Loop-invariant; computed before the loop so a hotel without any
        # review entries still gets its overview written instead of hitting
        # an unbound name at the write below.
        filename = response.url.split('?')[1].split('&')[1].split('=')[1]

        # Number of reviews per page
        num = len(sel.xpath('/html/body/div/div/div/div[3]/text()'))
        for flag in xrange(1, num):
            # Review
            item = hotelReview()
            print(flag)
            base = '/html/body/div/div/div/div[3]/div[' + str(flag) + ']'
            author = first_text(base + '/div[1]/p[2]/text()')
            user_type = first_text(base + '/div[1]/p[1]/@title')
            date = first_text(base + '/p/span[3]/a/text()')
            room_type = first_text(base + '/div[1]/p[3]/text()')
            review_overall_rating = first_text(
                base + '/p/span[2]/span/text()')
            # Comma-separated aspect scores; split once, reused below.
            aspects = first_text(base + '/p/span[1]/@data-value').split(',')
            helpful = first_number(base + '/div[2]/a/span/text()')
            review = first_text(base + '/div[2]/text()')

            # Debug trace kept as in the original output.
            print('HIIIIIIIIIII')
            print(filename)

            # item is an object
            item['author'] = author
            item['user_type'] = user_type
            item['date'] = date
            item['room_type'] = room_type
            item['review_overall_rating'] = review_overall_rating
            # An earlier text-node extraction returned labelled strings, e.g.
            # ["", " ", "cleanliness:5", " ", "service:5", " ",
            #  "facility:5", " ", "location:5\r\n", "", ...]
            # The data-value parts are read positionally instead.
            aspect_keys = ('location', 'facility', 'service', 'clean')
            for position, key in enumerate(aspect_keys):
                item[key] = re.findall(r'\d+', aspects[position])[0]
            item['review'] = review
            item['helpful'] = helpful
            review_list.append(dict(item))
        print(review_list)

        # Write the file like the pipe
        con = json.dumps(review_list, ensure_ascii=False).encode('utf8')
        self.writeAppendFile(filename, con)
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit are
        # no longer swallowed.
        log.msg("Review Error !!!!" + response.url, level=log.WARNING)
def parseReview(self, response):
    """Scrape a hotel's rating overview and its reviews, then append them.

    The first element of the written JSON array is the hotel overview (URL,
    overall rating, recommendation figures, per-traveller-type counts,
    average aspect scores, comment counts). It is followed by one entry per
    review found under the third content ``div``. The array is encoded as
    UTF-8 and passed to ``self.writeAppendFile`` under the name taken from
    the value of the second query parameter of ``response.url``.

    Errors while extracting the overview propagate (IndexError when an
    expected node is missing). Errors while extracting reviews or writing
    are logged as a warning and nothing is written.
    """
    sel = Selector(response)

    def first_text(xpath):
        """Return the stripped first match of `xpath` (IndexError if none)."""
        return sel.xpath(xpath).extract()[0].strip()

    def first_number(xpath):
        """Return the first run of digits in the first match of `xpath`."""
        return re.findall(r'\d+', first_text(xpath))[0]

    # hotel profile
    hotel_overview = {}
    hotel_url = sel.xpath('/html/body/div/div/div/div[2]/a[1]/@href').extract()
    hotel_overview['url'] = 'http://hotels.ctrip.com' + str(
        hotel_url[0].split('_')[0])

    summary = '/html/body/div/div/div/div[1]'
    hotel_overview['total_overall_rating'] = first_text(
        summary + '/div[1]/span[2]/span/text()')
    hotel_overview['per_recomment'] = first_text(
        summary + '/div[1]/span[3]/span/text()')

    # The n-th link under #comment_statistics feeds the n-th key.
    traveller_keys = ('for_biz', 'for_friend', 'for_couple', 'for_family',
                      'for_single', 'for_agent', 'for_others')
    for position, key in enumerate(traveller_keys, 1):
        hotel_overview[key] = first_number(
            '//*[@id="comment_statistics"]/a[' + str(position) +
            ']/span/text()')

    # The n-th paragraph of the averages box feeds the n-th key.
    average_keys = ('avg_location', 'avg_facility', 'avg_service',
                    'avg_clean')
    for position, key in enumerate(average_keys, 1):
        hotel_overview[key] = first_text(
            summary + '/div[3]/p[' + str(position) + ']/span/text()')

    hotel_overview['all_comment'] = first_number(
        '//*[@id="All_Commnet"]/text()')
    hotel_overview['recomment'] = first_number('//*[@id="Recomment"]/text()')
    hotel_overview['no_recomment'] = first_number(
        '//*[@id="No_Recoment"]/text()')

    review_list = [dict(hotel_overview)]

    try:
        # Loop-invariant; computed before the loop so a hotel without any
        # review entries still gets its overview written instead of hitting
        # an unbound name at the write below.
        filename = response.url.split('?')[1].split('&')[1].split('=')[1]

        # Number of reviews per page
        num = len(sel.xpath('/html/body/div/div/div/div[3]/text()'))
        for flag in xrange(1, num):
            # Review
            item = hotelReview()
            print(flag)
            base = '/html/body/div/div/div/div[3]/div[' + str(flag) + ']'
            author = first_text(base + '/div[1]/p[2]/text()')
            user_type = first_text(base + '/div[1]/p[1]/@title')
            date = first_text(base + '/p/span[3]/a/text()')
            room_type = first_text(base + '/div[1]/p[3]/text()')
            review_overall_rating = first_text(
                base + '/p/span[2]/span/text()')
            # Comma-separated aspect scores; split once, reused below.
            aspects = first_text(base + '/p/span[1]/@data-value').split(',')
            helpful = first_number(base + '/div[2]/a/span/text()')
            review = first_text(base + '/div[2]/text()')

            # Debug trace kept as in the original output.
            print('HIIIIIIIIIII')
            print(filename)

            # item is an object
            item['author'] = author
            item['user_type'] = user_type
            item['date'] = date
            item['room_type'] = room_type
            item['review_overall_rating'] = review_overall_rating
            # An earlier text-node extraction returned labelled strings, e.g.
            # ["", " ", "cleanliness:5", " ", "service:5", " ",
            #  "facility:5", " ", "location:5\r\n", "", ...]
            # The data-value parts are read positionally instead.
            aspect_keys = ('location', 'facility', 'service', 'clean')
            for position, key in enumerate(aspect_keys):
                item[key] = re.findall(r'\d+', aspects[position])[0]
            item['review'] = review
            item['helpful'] = helpful
            review_list.append(dict(item))
        print(review_list)

        # Write the file like the pipe
        con = json.dumps(review_list, ensure_ascii=False).encode('utf8')
        self.writeAppendFile(filename, con)
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit are
        # no longer swallowed.
        log.msg("Review Error !!!!" + response.url, level=log.WARNING)
def parse(self, response):
    """Scrape one page of hotel comments and append them to a file.

    The number of reviews is taken from the count of text nodes directly
    under ``#hotelCommentList``; for every index ``1..num`` the matching
    ``li`` is read into a ``hotelReview`` item (author, date, room type,
    overall rating, review text, helpful text). The collected reviews are
    serialised as one UTF-8 JSON array and passed to
    ``self.writeAppendFile`` under the name taken from the value of the
    second query parameter of ``response.url``.

    Yields:
        Two ``Request`` objects for the comment endpoint, with this method
        as their callback.

    Raises:
        IndexError: if ``response.url`` does not carry the expected query
            parameters, or an indexed review lacks one of the fields.
    """
    sel = Selector(response)
    review_list = []

    # (item key, XPath relative to one review <li>)
    field_paths = (
        ('author', '/div[1]/p[2]/text()'),
        ('date', '/div[1]/p[3]/text()'),
        ('room_type', '/div[2]/div/span[1]/text()'),
        ('total_overall_rating', '/div[2]/div/span[3]/text()'),
        ('review', '/div[2]/p[3]/text()'),
        ('helpful', '/div[2]/p[1]/a/text()'),
    )

    def field(index, tail):
        """Return the stripped first match of `tail` under review `index`."""
        xpath = '//*[@id="hotelCommentList"]/li[' + str(index) + ']' + tail
        return sel.xpath(xpath).extract()[0].strip()

    # Number of reviews per page
    num = len(sel.xpath('//*[@id="hotelCommentList"]/text()'))

    # Number of pages per hotel
    page_list = len(sel.xpath('/html/body/div/div/div/div[3]/div[1]/text()'))
    # Read the pager text through extract() instead of slicing the selector
    # list's repr, and tolerate hotels that have no 11th pager link.
    page_texts = sel.xpath(
        '/html/body/div/div/div/div[3]/div[1]/a[11]/span/text()').extract()
    page = page_texts[0].strip().encode('utf8') if page_texts else ''
    print(page_list - 2)
    print(page)

    # Loop-invariant; computed up front so the final write still has a
    # target when the page contains no reviews.
    filename = response.url.split('?')[1].split('&')[1].split('=')[1]

    for flag in xrange(1, num + 1):
        # Review
        item = hotelReview()
        for name, tail in field_paths:
            item[name] = field(flag, tail)
        review_list.append(dict(item))
    print(len(review_list))

    for page_no in range(1, 3):
        link = response.url.replace(
            urlparse(response.url)[4].split('&')[4],
            'currentPage=' + str(page_no))
        print(link)
        # NOTE(review): `link` is only printed; the request below always
        # targets this fixed URL. Confirm whether `link` should be requested.
        yield Request(
            url="http://hotels.ctrip.com/Domestic/tool/AjaxGetHotelDetailComment.aspx?MasterHotelID=-1&hotel=441507&card=0&property=0&currentPage=1",
            callback=self.parse)

    # Write the file like the pipe
    con = json.dumps(review_list, ensure_ascii=False).encode('utf8')
    self.writeAppendFile(filename, con)