def badges(self):
		"""
			Collect the reviewer's badge statistics.

			Returns the tuple (reviewerTitle, review_count,
			hotel_review_count, helpful_count). The first three values are
			read from the 'totalReviewBadge' div, the last one from the
			'helpfulVotesBadge' div. A title that cannot be read is '' and
			a count that cannot be read is -1.
		"""
		reviewerTitle, review_count, hotel_review_count = '', -1, -1

		try:
			badge = self.reviewer_tag.find('div', class_='totalReviewBadge')
			badge_spans = badge.find_all('span', class_='badgeText')
		except AttributeError:
			# No usable badge element: all three defaults above stand.
			pass
		else:
			try:
				reviewerTitle = str(badge.find('div', class_='reviewerTitle').string)
			except AttributeError:
				pass

			# First badge span: total reviews; second: hotel reviews.
			try:
				review_count = TAutil.strip_comma(badge_spans[0].string)
			except (AttributeError, IndexError):
				pass

			try:
				hotel_review_count = TAutil.strip_comma(badge_spans[1].string)
			except (AttributeError, IndexError):
				pass

		try:
			helpful_span = self.reviewer_tag.select('div.helpfulVotesBadge span.badgeText')[0]
			helpful_count = TAutil.strip_comma(str(helpful_span.string))
		except (AttributeError, IndexError):
			helpful_count = -1

		return reviewerTitle, review_count, hotel_review_count, helpful_count
    def parse_init(self, hotel_url, is_hotel_page):
        """
        Load a hotel page and keep only its 'REVIEWS' part in self.soup.

        hotel_url     -- address of the page to open; also stored in
                         self.hotel_ori_url
        is_hotel_page -- when true, follow the link of the first review
                         title and parse that page instead, so that the
                         review entry is expanded
        """
        reviews_only = SoupStrainer(id='REVIEWS')
        self.hotel_html = TAutil.open_page(hotel_url)
        self.hotel_ori_url = hotel_url
        self.soup = BeautifulSoup(self.hotel_html, parse_only=reviews_only)

        if not is_hotel_page:
            return

        # The first review title links to the expanded review view.
        first_quote = self.soup.find('div', class_='quote')
        expanded_url = TAutil.wrap_tripadvisor(str(first_quote.a['href']))
        expanded_html = TAutil.open_page(expanded_url)
        self.soup = BeautifulSoup(expanded_html, parse_only=reviews_only)
	def parse_init(self, city_page_url):
		"""
		Open the city page and store its parsed 'ACCOM_OVERVIEW' part
		in self.soup.
		"""
		self.soup = BeautifulSoup(
			TAutil.open_page(city_page_url),
			parse_only=SoupStrainer(id="ACCOM_OVERVIEW"),
		)
	def review_content(self):
		"""
		Return the review body after TAutil.process_text.

		The text is self.tc['body'] when self.tc is truthy; otherwise it
		is the space-joined strings of the <p> inside the 'entry' div of
		self.reviewer_tag.
		"""
		if self.tc:
			return TAutil.process_text(self.tc['body'])

		paragraph = self.reviewer_tag.find('div', class_='entry').p
		return TAutil.process_text(' '.join(paragraph.strings))
	def large_reviews(self, hotel):
		"""
		Tell whether a hotel entry has at least self.min_reviews reviews.

		The count is read from the link inside the entry's 'more' span.
		If reading it raises AttributeError (for instance because the
		span is absent) the answer is False.
		"""
		try:
			link_text = str(hotel.find('span', class_='more').a.string)
			review_total = TAutil.strip_comma(link_text.strip())
		except AttributeError:
			return False
		return not review_total < self.min_reviews
 def hotel_reviews(self):
     """
     Return the reviews of the hotel, gathering them on first use.

     When self.reviews is already truthy it is returned unchanged.
     Otherwise the reviews of each page are appended to self.reviews,
     moving to the next page with parse_init until TAutil.next_page
     raises TypeError.
     """
     if self.reviews:
         return self.reviews

     self.reviews = []
     more_pages = True
     while more_pages:
         # get_review decides whether the page needs an ajax request
         page_reviews = self.get_review()
         self.reviews.extend(page_reviews)
         try:
             following = TAutil.next_page(self.soup)
         except TypeError:
             more_pages = False
         else:
             self.parse_init(following, False)
     return self.reviews
	def hotel_urls(self):
		"""
		Return the urls of the listed hotels, gathering them on first use.

		When self.urls is already truthy it is returned unchanged.
		Otherwise every page is scanned for elements whose id matches
		'hotel_<digits>'; entries accepted by large_reviews contribute
		their url. Paging stops when TAutil.next_page raises TypeError.
		"""
		if self.urls:
			return self.urls

		self.urls = []
		hotel_id = re.compile(r'hotel_\d*')
		more_pages = True
		while more_pages:
			entries = self.soup.find_all(id=hotel_id)
			page_urls = [
				self.find_hotel_url(entry)
				for entry in entries
				if self.large_reviews(entry)
			]
			self.urls.extend(page_urls)
			try:
				following = TAutil.next_page(self.soup)
			except TypeError:
				more_pages = False
			else:
				self.parse_init(following)
		return self.urls
    def get_review(self):
        """
        Pair every 'review' element of the current page with its
        title/content data.

        Returns a list of 2-tuples. The first item is the review element.
        The second item is None unless self.check_ajax() is true, in
        which case it comes from TAutil.fetch_ajax_title_content called
        with the review ids of the page.
        """
        page_reviews = self.soup.find_all(class_='review')

        if not self.check_ajax():
            return [(review, None) for review in page_reviews]

        # NOTE(review): the two prints below look like debugging output.
        print(self.soup.find_all('div', class_='quote'))

        ids = [ReviewParser.ReviewParser(review).review_id for review in page_reviews]
        print(ids)

        fetched = TAutil.fetch_ajax_title_content(ids)
        return list(zip(page_reviews, fetched))
 def trip_type(self):
     """
     Return the values found under 'div.trip_type div.value', each
     passed through TAutil.strip_comma.

     When no such element exists the result is four -1 entries.
     """
     parsed = [
         TAutil.strip_comma(cell.string)
         for cell in self.soup.select('div.trip_type div.value')
     ]
     return parsed or [-1] * 4
 def rating_count(self):
     """
     Return the values of every 'span.compositeCount' inside
     'div.col2of2.composite', each passed through TAutil.strip_comma.
     """
     return [
         TAutil.strip_comma(span.string)
         for span in self.soup.select('div.col2of2.composite span.compositeCount')
     ]
Example #11
0
	def find_hotel_url(self, hotel):
		"""
		Return the href of the entry's 'property_title' element, passed
		through TAutil.wrap_tripadvisor.
		"""
		title_link = hotel.find(class_='property_title')
		return TAutil.wrap_tripadvisor(str(title_link['href']))
	def review_title(self):
		"""
		Return the review title after TAutil.process_text.

		The title is self.tc['name'] when self.tc is truthy; otherwise it
		is the string of the 'quote' div of self.reviewer_tag.
		"""
		if self.tc:
			return TAutil.process_text(self.tc['name'])

		quote = self.reviewer_tag.find('div', class_='quote')
		return TAutil.process_text(str(quote.string))
	def numHlp(self):
		"""
		Return the value of the 'numHlpIn' span of self.reviewer_tag,
		passed through TAutil.strip_comma.

		Returns 0 when reading it raises AttributeError (for instance
		because the span is absent).
		"""
		try:
			votes = self.reviewer_tag.find('span', class_='numHlpIn').string
			return TAutil.strip_comma(votes)
		except AttributeError:
			return 0