Example #1
    def parse_resto(self, response):
        """SECOND PARSING : Given a restaurant, get each review url and get to parse it
            - Usually there are 10 comments per page
        """
        logger.warn(' > PARSING NEW REVIEW PAGE ({})'.format(self.resto_pg_nb))
        self.resto_pg_nb += 1

        # Get the list of reviews on the restaurant page

        ########################
        #### YOUR CODE HERE ####
        ########################
        
        urls_review = get_info.get_urls_review_in_resto(response)
        
        ########################
        ########################

        # For each review open the link and parse it into the parse_review method
        for url_review in urls_review:
            yield response.follow(url=url_review, callback=self.parse_review)

        
        ########################
        #### YOUR CODE HERE ####
        ########################
        
        next_page, next_page_number = get_info.get_urls_next_list_of_reviews(response)
        
        # Follow the page if we decide to
        if get_info.go_to_next_page(next_page, next_page_number, max_page=50):
            yield response.follow(next_page, callback=self.parse_resto)
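A minimal sketch of what the two get_info review helpers used above might look like, reusing the div.quote XPath and the "nav next ui_button primary" link selector that appear inline in the later examples. The function bodies and the "-orNN-" offset parsing are assumptions, not the project's actual code:

import re

def get_urls_review_in_resto(response):
    """Return the URLs of the individual reviews listed on a restaurant page."""
    # Each review title on the restaurant page sits inside a div.quote element
    return response.xpath('//div[@class="quote"]//a/@href').extract()

def get_urls_next_list_of_reviews(response):
    """Return the next review-page URL and its page number, or (None, None)."""
    next_page = response.xpath('//a[@class="nav next ui_button primary"]/@href').extract_first()
    if next_page is None:
        return None, None
    # Assumption: the review offset is encoded as "-orNN-" in the URL, with 10 reviews per page
    match = re.search(r'-or(\d+)-', next_page)
    next_page_number = int(match.group(1)) // 10 + 1 if match else None
    return next_page, next_page_number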
Example #2
    def parse(self, response):
        """MAIN PARSING : Start from a classical reastaurant page
            - Usually there are 30 restaurants per page
            - 
        """

        # Display a message in the console
        logger.warn(' > PARSING NEW MAIN PAGE OF RESTO ({})'.format(self.main_nb))
        self.main_nb += 1

        # Get the list of the 30 restaurants of the page
        restaurant_urls = get_info.get_urls_resto_in_main_search_page(response)
        
        # For each url : follow restaurant url to get the reviews
        for restaurant_url in restaurant_urls:
            logger.warn('> New restaurant detected : {}'.format(restaurant_url))
            yield response.follow(url=restaurant_url, callback=self.parse_resto)

        
        # Get next page information
        next_page, next_page_number = get_info.get_urls_next_list_of_restos(response)
        
        # Follow the page if we decide to
        if get_info.go_to_next_page(next_page, next_page_number, max_page=3):
            yield response.follow(next_page, callback=self.parse)
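Likewise, a rough sketch of the two get_info helpers this parse method relies on, assuming the same CSS selector as the inline variant further down and the same "nav next" pagination link. The obfuscated class name and the "-oaNN-" offset parsing are assumptions:

import re

def get_urls_resto_in_main_search_page(response):
    """Return the restaurant links listed on a search-results page (usually 30 per page)."""
    # Same selector as the inline variant below; the obfuscated class name can change over time
    return response.css('div.wQjYiB7z > span > a ::attr(href)').extract()

def get_urls_next_list_of_restos(response):
    """Return the next search-results page URL and its page number, or (None, None)."""
    next_page = response.xpath('//a[@class="nav next ui_button primary"]/@href').extract_first()
    if next_page is None:
        return None, None
    # Assumption: the restaurant offset is encoded as "-oaNN-" in the URL, with 30 restaurants per page
    match = re.search(r'-oa(\d+)-', next_page)
    next_page_number = int(match.group(1)) // 30 + 1 if match else None
    return next_page, next_page_number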
    def parse(self, response):
        """MAIN PARSING : Start from a classical reastaurant page
            - Usually there are 30 restaurants per page
            - 
        """

        # Display a message in the console
        logger.warn(' > PARSING NEW MAIN PAGE OF RESTO ({})'.format(
            self.main_nb))
        self.main_nb += 1

        # Get the list of the 30 restaurants of the page
        restaurant_urls = response.css(
            'div.wQjYiB7z > span > a ::attr(href)').extract()
        # For each url : follow restaurant url to get the reviews
        for restaurant_url in restaurant_urls:
            # response.follow resolves relative hrefs against the current page URL,
            # so there is no need to prepend the site base manually
            #logger.warn('> New restaurant detected : {}'.format(restaurant_url))
            yield response.follow(url=restaurant_url,
                                  callback=self.parse_resto)

        # Get next page information
        next_page, next_page_number = get_info.get_urls_next_list_of_restos(
            response)

        # Follow the page if we decide to
        if get_info.go_to_next_page(next_page, next_page_number, max_page=10):
            yield response.follow(next_page, callback=self.parse)
    def parse_resto(self, response):
        """
        SECOND PARSING : Given a restaurant page, get each review URL and parse it
        - Usually there are 10 reviews per page
        """
        logger.warn(' > PARSING NEW RESTO PAGE ({})'.format(self.resto_nb))
        self.resto_nb += 1

        ## Get Restaurant Info
        xpath = '//div[@class="header_links"]/a/text()'
        self.resto_price = response.xpath(xpath).extract()[0]
        self.resto_type = response.xpath(xpath).extract()[1:]

        # Get the list of reviews on the restaurant page
        urls_review = response.xpath(
            '//div[@class="quote"]//a/@href').extract()

        # For each review open the link and parse it into the parse_review method
        for url_review in urls_review:
            yield response.follow(url=url_review, callback=self.parse_review)

        next_page, next_page_number = get_info.get_urls_next_list_of_reviews(
            response)

        if get_info.go_to_next_page(next_page, next_page_number, max_page=10):
            yield response.follow(next_page, callback=self.parse_resto)
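go_to_next_page is the stopping rule shared by all of these examples. Its implementation is not shown, but from the way it is called (a next-page URL, a page number, and a max_page limit) and the boolean it must return, a plausible sketch is:

def go_to_next_page(next_page, next_page_number, max_page=10):
    """Decide whether the spider should follow the next-page link.

    Follow only if a next-page URL was actually found and the page limit
    passed by the caller has not been reached yet.
    """
    if next_page is None or next_page_number is None:
        return False
    return next_page_number <= max_page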
Example #5
    def parse(self, response):
        """MAIN PARSING : Start from a classical restaurant page
            - Usually there are 30 restaurants per page
        """
        logger.warn('> PARSING NEW MAIN PAGE OF ARTICLES ({})'.format(self.main_nb))

        self.main_nb += 1

        # Get the list of the articles
        xpath = '//div[@class="quote"]'
        my_urls = response.xpath(xpath).css('::attr(href)').extract()
        for url in my_urls:
            yield response.follow(url=url, callback=self.parse_article)

        # Deal with next page (the locator is an XPath expression, so use response.xpath)
        xpath = '//a[@class="nav next ui_button primary"]/@href'
        next_page = response.xpath(xpath).extract_first()
        try:
            next_page_number = int(next_page.split('=')[-1])
        except (AttributeError, ValueError):
            next_page_number = self.main_nb + 1

        if get_info.go_to_next_page(next_page, next_page_number, self.max_page):
            yield response.follow(next_page, callback=self.parse)
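All of these callbacks are methods of a scrapy.Spider subclass that also owns the counters and the page limit they reference (main_nb, resto_nb, resto_pg_nb, max_page). A minimal skeleton showing how they fit together; the spider name and start URL are placeholders, not the project's actual values:

import logging
import scrapy

logger = logging.getLogger(__name__)

class TripAdvisorSpider(scrapy.Spider):
    name = 'tripadvisor_reviews'                                # placeholder spider name
    start_urls = ['https://www.tripadvisor.co.uk/Restaurants']  # placeholder: replace with the listing page to crawl

    # Counters and limits referenced by the parse methods above
    main_nb = 0      # search-results pages parsed so far
    resto_nb = 0     # restaurant pages parsed so far
    resto_pg_nb = 0  # review pages parsed so far
    max_page = 3     # page limit handed to go_to_next_page

    # parse(), parse_resto() and parse_review() from the examples above go here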