def parse_resto(self, response):
    """SECOND PARSING: given a restaurant page, parse each of its reviews.

    Usually there are 10 reviews per page. Each review URL is followed
    into ``parse_review``; pagination continues through the restaurant's
    review pages up to ``max_page=50``.

    Args:
        response: the Scrapy response for a restaurant page.

    Yields:
        scrapy.Request objects for each review and for the next page.
    """
    # logger.warn is deprecated — use warning() with lazy %-formatting
    logger.warning(' > PARSING NEW REVIEW PAGE (%s)', self.resto_pg_nb)
    self.resto_pg_nb += 1

    # Extract the review URLs listed on this restaurant page
    urls_review = get_info.get_urls_review_in_resto(response)

    # Follow each review link into the dedicated review parser
    for url_review in urls_review:
        yield response.follow(url=url_review, callback=self.parse_review)

    # Pagination: locate the next page of reviews, if any
    next_page, next_page_number = get_info.get_urls_next_list_of_reviews(response)

    # Follow the next page only while under the page cap
    if get_info.go_to_next_page(next_page, next_page_number, max_page=50):
        yield response.follow(next_page, callback=self.parse_resto)
def parse(self, response):
    """MAIN PARSING: parse a restaurant search-results page.

    Usually there are 30 restaurants per page. Each restaurant URL is
    followed into ``parse_resto``; pagination continues through the
    result pages up to ``max_page=3``.

    Args:
        response: the Scrapy response for a search-results page.

    Yields:
        scrapy.Request objects for each restaurant and for the next page.
    """
    # logger.warn is deprecated — use warning() with lazy %-formatting
    logger.warning(' > PARSING NEW MAIN PAGE OF RESTO (%s)', self.main_nb)
    self.main_nb += 1

    # Get the list of the ~30 restaurants on the page
    restaurant_urls = get_info.get_urls_resto_in_main_search_page(response)

    # Follow each restaurant URL to collect its reviews
    for restaurant_url in restaurant_urls:
        logger.warning('> New restaurant detected : %s', restaurant_url)
        yield response.follow(url=restaurant_url, callback=self.parse_resto)

    # Pagination: locate the next results page, if any
    next_page, next_page_number = get_info.get_urls_next_list_of_restos(response)

    # Follow the next page only while under the page cap
    if get_info.go_to_next_page(next_page, next_page_number, max_page=3):
        yield response.follow(next_page, callback=self.parse)
def parse(self, response):
    """MAIN PARSING: parse a restaurant search-results page.

    Usually there are 30 restaurants per page. Each restaurant URL is
    followed into ``parse_resto``; pagination continues through the
    result pages up to ``max_page=10``.

    Args:
        response: the Scrapy response for a search-results page.

    Yields:
        scrapy.Request objects for each restaurant and for the next page.
    """
    # logger.warn is deprecated — use warning() with lazy %-formatting
    logger.warning(' > PARSING NEW MAIN PAGE OF RESTO (%s)', self.main_nb)
    self.main_nb += 1

    # Relative hrefs of the restaurants listed on this page
    restaurant_urls = response.css(
        'div.wQjYiB7z > span > a ::attr(href)').extract()

    for restaurant_url in restaurant_urls:
        # response.follow resolves relative hrefs against response.url,
        # so the previous hard-coded base-URL concatenation (which
        # produced a double slash for root-relative hrefs and corrupted
        # already-absolute ones) is unnecessary.
        yield response.follow(url=restaurant_url, callback=self.parse_resto)

    # Pagination: locate the next results page, if any
    next_page, next_page_number = get_info.get_urls_next_list_of_restos(
        response)

    # Follow the next page only while under the page cap
    if get_info.go_to_next_page(next_page, next_page_number, max_page=10):
        yield response.follow(next_page, callback=self.parse)
def parse_resto(self, response):
    """SECOND PARSING: given a restaurant page, parse each of its reviews.

    Usually there are 10 reviews per page. Also records the restaurant's
    price range and cuisine types from the page header. Pagination
    continues through the review pages up to ``max_page=10``.

    Args:
        response: the Scrapy response for a restaurant page.

    Yields:
        scrapy.Request objects for each review and for the next page.
    """
    # logger.warn is deprecated — use warning() with lazy %-formatting
    logger.warning(' > PARSING NEW RESTO PAGE (%s)', self.resto_nb)
    self.resto_nb += 1

    # Restaurant header info: first link is the price range, the rest
    # are cuisine types. Extract once instead of running the XPath twice.
    header_links = response.xpath(
        '//div[@class="header_links"]/a/text()').extract()
    # Guard against pages missing the header block (was an IndexError)
    self.resto_price = header_links[0] if header_links else None
    self.resto_type = header_links[1:]

    # Review URLs listed on this restaurant page
    urls_review = response.xpath(
        '//div[@class="quote"]//a/@href').extract()

    # Follow each review link into the dedicated review parser
    for url_review in urls_review:
        yield response.follow(url=url_review, callback=self.parse_review)

    # Pagination: locate the next page of reviews, if any
    next_page, next_page_number = get_info.get_urls_next_list_of_reviews(
        response)

    # Follow the next page only while under the page cap
    if get_info.go_to_next_page(next_page, next_page_number, max_page=10):
        yield response.follow(next_page, callback=self.parse_resto)
def parse(self, response):
    """MAIN PARSING: parse a listing page of articles.

    Usually there are 30 items per page. Each article URL is followed
    into ``parse_article``; pagination continues up to ``self.max_page``.

    Args:
        response: the Scrapy response for a listing page.

    Yields:
        scrapy.Request objects for each article and for the next page.
    """
    # logger.warn is deprecated — use warning() with lazy %-formatting
    logger.warning('> PARSING NEW MAIN PAGE OF ARTICLES (%s)', self.main_nb)
    self.main_nb += 1

    # Article hrefs inside the quote blocks
    my_urls = response.xpath('//div[@class="quote"]').css('::attr(href)').extract()
    for article_url in my_urls:
        yield response.follow(url=article_url, callback=self.parse_article)

    # BUG FIX: this locator is an XPath expression but was passed to
    # response.css(), which rejects XPath syntax — use response.xpath().
    next_page_locator = '//a[@class ="nav next ui_button primary"]/@href'
    next_page = response.xpath(next_page_locator).extract_first()
    try:
        # The page number is the last '='-separated token of the href
        next_page_number = int(next_page.split('=')[-1])
    except (AttributeError, ValueError):
        # next_page is None or its suffix is not numeric — fall back to
        # the running page counter instead of swallowing everything.
        next_page_number = self.main_nb + 1

    # Follow the next page only while under the page cap
    if get_info.go_to_next_page(next_page, next_page_number, self.max_page):
        yield response.follow(next_page, callback=self.parse)