def badges(self):
    """
    Collect the reviewer's badge data from ``self.reviewer_tag``.

    ``div.totalReviewBadge`` supplies the reviewer title and the
    ``span.badgeText`` counts; ``div.helpfulVotesBadge`` supplies the
    helpful-votes figure.

    Returns the 4-tuple ``(reviewerTitle, review_count,
    hotel_review_count, helpful_count)``.  A title that cannot be read
    becomes ``''`` and a count that cannot be read becomes ``-1``.
    """
    def count_at(spans, position):
        # A missing span (IndexError) or a non-tag entry (AttributeError)
        # is reported as -1.
        try:
            return TAutil.strip_comma(spans[position].string)
        except (AttributeError, IndexError):
            return -1

    try:
        badge = self.reviewer_tag.find('div', class_='totalReviewBadge')
        try:
            title = str(badge.find('div', class_='reviewerTitle').string)
        except AttributeError:
            title = ''
        spans = badge.find_all('span', class_='badgeText')
        total_reviews = count_at(spans, 0)
        hotel_total = count_at(spans, 1)
    except AttributeError:
        # The badge block itself could not be located: use every default.
        title, total_reviews, hotel_total = '', -1, -1

    try:
        # Read the text first; only then hand it to the number parser.
        votes_text = str(
            self.reviewer_tag.select('div.helpfulVotesBadge span.badgeText')[0].string
        )
        helpful = TAutil.strip_comma(votes_text)
    except (AttributeError, IndexError):
        helpful = -1

    return title, total_reviews, hotel_total, helpful
def parse_init(self, hotel_url, is_hotel_page):
    """
    Fetch a hotel page and keep only its ``REVIEWS`` element as ``self.soup``.

    hotel_url     -- address passed to ``TAutil.open_page``; also stored
                     in ``self.hotel_ori_url``.
    is_hotel_page -- when true, follow the link of the first review title
                     (first ``div.quote``) and parse that page instead, so
                     that the review entry is expanded.

    ``self.hotel_html`` holds the HTML of ``hotel_url``; it is not replaced
    when the first-review page is loaded.
    """
    reviews_only = SoupStrainer(id='REVIEWS')

    self.hotel_html = TAutil.open_page(hotel_url)
    self.hotel_ori_url = hotel_url
    self.soup = BeautifulSoup(self.hotel_html, parse_only=reviews_only)

    if not is_hotel_page:
        return

    # open the first review
    first_href = str(self.soup.find('div', class_='quote').a['href'])
    expanded_url = TAutil.wrap_tripadvisor(first_href)
    expanded_html = TAutil.open_page(expanded_url)
    self.soup = BeautifulSoup(expanded_html, parse_only=reviews_only)
def parse_init(self, city_page_url):
    """
    Load a city page into ``self.soup``.

    The page is fetched with ``TAutil.open_page`` and parsing is restricted
    to the element whose id is ``ACCOM_OVERVIEW``.
    """
    page_source = TAutil.open_page(city_page_url)
    self.soup = BeautifulSoup(
        page_source,
        parse_only=SoupStrainer(id="ACCOM_OVERVIEW"),
    )
def review_content(self):
    """
    Return the review body, passed through ``TAutil.process_text``.

    When ``self.tc`` is truthy its ``'body'`` entry is used.  Otherwise the
    strings of the ``<p>`` inside ``div.entry`` of ``self.reviewer_tag``
    are joined with single spaces.
    """
    if not self.tc:
        paragraph = self.reviewer_tag.find('div', class_='entry').p
        raw_body = ' '.join(paragraph.strings)
    else:
        raw_body = self.tc['body']
    return TAutil.process_text(raw_body)
def large_reviews(self, hotel):
    """
    Tell whether a hotel listing reaches ``self.min_reviews``.

    hotel -- listing element; the review count is read from the link text
             inside its ``span.more``.

    Returns False when that text cannot be reached (AttributeError) or when
    the parsed count compares below ``self.min_reviews``; True otherwise.
    """
    try:
        link_text = str(hotel.find('span', class_='more').a.string)
        review_total = TAutil.strip_comma(link_text.strip())
    except AttributeError:
        return False
    return not (review_total < self.min_reviews)
def hotel_reviews(self):
    """
    Return every review of the hotel, crawling page by page on first use.

    If ``self.reviews`` is already non-empty it is returned as is.
    Otherwise each page's ``get_review()`` result is appended and
    ``parse_init`` is called on the address from ``TAutil.next_page`` until
    that helper raises ``TypeError``.
    """
    if self.reviews:
        return self.reviews

    self.reviews = []
    while True:
        # test whether the page need ajax request
        page_reviews = self.get_review()
        self.reviews.extend(page_reviews)
        try:
            following_url = TAutil.next_page(self.soup)
        except TypeError:
            # Treated as "no further page" -- presumably next_page fails
            # this way when the link is missing; confirm against TAutil.
            break
        self.parse_init(following_url, False)
    return self.reviews
def hotel_urls(self):
    """
    Return the URLs of all listed hotels that pass ``large_reviews``.

    If ``self.urls`` is already non-empty it is returned as is.  Otherwise
    every element whose id matches ``hotel_\\d*`` is checked on each page,
    and ``parse_init`` is called on the address from ``TAutil.next_page``
    until that helper raises ``TypeError``.
    """
    if self.urls:
        return self.urls

    hotel_id = re.compile(r'hotel_\d*')
    self.urls = []
    while True:
        # Collect the whole page first so a failure part-way through does
        # not leave a partially added page in self.urls.
        page_urls = []
        for listing in self.soup.find_all(id=hotel_id):
            if self.large_reviews(listing):
                page_urls.append(self.find_hotel_url(listing))
        self.urls.extend(page_urls)
        try:
            following_url = TAutil.next_page(self.soup)
        except TypeError:
            # Treated as "no further page" -- presumably next_page fails
            # this way when the link is missing; confirm against TAutil.
            break
        self.parse_init(following_url)
    return self.urls
def get_review(self):
    """
    Pair every ``.review`` element of the current page with its title/content.

    When ``self.check_ajax()`` is false each element is paired with
    ``None``.  Otherwise the review ids are gathered through
    ``ReviewParser.ReviewParser`` and the result of
    ``TAutil.fetch_ajax_title_content`` is zipped onto the elements.

    Returns a list of ``(review_element, title_content_or_None)`` tuples.
    """
    page_reviews = self.soup.find_all(class_='review')

    if not self.check_ajax():
        placeholders = [None] * len(page_reviews)
        return list(zip(page_reviews, placeholders))

    print(self.soup.find_all('div', class_='quote'))
    review_ids = []
    for review in page_reviews:
        review_ids.append(ReviewParser.ReviewParser(review).review_id)
    print(review_ids)
    fetched = TAutil.fetch_ajax_title_content(review_ids)
    return list(zip(page_reviews, fetched))
def trip_type(self):
    """
    Return the parsed values of every ``div.trip_type div.value`` element.

    Each element's string goes through ``TAutil.strip_comma``.  When no
    such element exists the result is four ``-1`` placeholders.
    """
    value_nodes = self.soup.select('div.trip_type div.value')
    parsed = [TAutil.strip_comma(node.string) for node in value_nodes]
    return parsed if parsed else [-1] * 4
def rating_count(self):
    """
    Return the parsed values of every ``span.compositeCount`` element found
    under ``div.col2of2.composite``.

    Each element's string goes through ``TAutil.strip_comma``; the list is
    empty when nothing matches.
    """
    totals = []
    for node in self.soup.select('div.col2of2.composite span.compositeCount'):
        totals.append(TAutil.strip_comma(node.string))
    return totals
def find_hotel_url(self, hotel):
    """
    Return the hotel's URL as produced by ``TAutil.wrap_tripadvisor``.

    hotel -- listing element; the ``href`` of its ``property_title``
             element is the value that gets wrapped.
    """
    title_link = hotel.find(class_='property_title')
    href = str(title_link['href'])
    return TAutil.wrap_tripadvisor(href)
def review_title(self):
    """
    Return the review title, passed through ``TAutil.process_text``.

    When ``self.tc`` is truthy its ``'name'`` entry is used; otherwise the
    string of ``div.quote`` inside ``self.reviewer_tag`` is used.
    """
    if not self.tc:
        quote = self.reviewer_tag.find('div', class_='quote')
        raw_title = str(quote.string)
    else:
        raw_title = self.tc['name']
    return TAutil.process_text(raw_title)
def numHlp(self):
    """
    Return the value of the ``span.numHlpIn`` element of
    ``self.reviewer_tag`` after ``TAutil.strip_comma``.

    Returns 0 when reading it raises ``AttributeError`` (for example when
    the span is absent).
    """
    try:
        helpful_span = self.reviewer_tag.find('span', class_='numHlpIn')
        return TAutil.strip_comma(helpful_span.string)
    except AttributeError:
        return 0