Example #1
    def parse_review(self, response):
        next_page_xpath = "(//*[@rel='next']/@href)[1]"
        default_rating_xpath = './/reevoo-score/@data-score'

        product = response.meta['product']
        reviews = response.xpath('//article[contains(@id,"review_")]')

        if not reviews:
            return

        # From observation, at least currys.co.uk uses a different format to present the review rating
        rating_xpath = response.meta.get('rating_xpath', '')
        if not rating_xpath:
            rating_xpath = default_rating_xpath

        last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
            self.mysql_manager, self.spider_conf['source_id'],
            product["source_internal_id"])

        for review in reviews:
            user_review = ReviewItem()
            date = self.extract(
                review.xpath(
                    './/span[contains(@class, "date_publish")]/text()'))
            if date:
                user_review['TestDateText'] = date_format(date, '')
                current_user_review = dateparser.parse(
                    user_review['TestDateText'], date_formats=['%Y-%m-%d'])
                # dateparser returns None for unparseable dates; only compare when parsing succeeded
                if current_user_review and current_user_review < last_user_review:
                    return

            user_review['DBaseCategoryName'] = "USER"
            user_review['ProductName'] = product['ProductName']
            user_review['TestUrl'] = product['TestUrl']
            user_review['SourceTestRating'] = self.extract(
                review.xpath(rating_xpath))
            user_review['Author'] = self.extract(
                review.xpath('.//h4[@class="attribution-name"]/text()'))
            user_review['TestPros'] = self.extract_all(
                review.xpath('.//dd[@class="pros"]/text()'))
            user_review['TestCons'] = self.extract_all(
                review.xpath('.//dd[@class="cons"]/text()'))
            user_review['source_internal_id'] = product['source_internal_id']

            # All reviews after the first empty review are empty, so stop here
            if user_review['TestPros'] or user_review['TestCons']:
                yield user_review
            else:
                return

        next_page_url = self.extract(response.xpath(next_page_xpath))
        if next_page_url:
            next_page_url = get_full_url(response, next_page_url)
            request = Request(next_page_url,
                              callback=self.parse_review,
                              meta=response.meta)
            yield request
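
All of the examples in this section share the same incremental cutoff: the scraped review date is normalised with date_format(), re-parsed with dateparser, and compared against the newest review date already stored for the product. A minimal, self-contained sketch of that comparison (the helper name and the sample dates below are illustrative, not taken from the project):

import dateparser

def is_older_than_last_scraped(date_text, last_user_review):
    # Hypothetical helper for illustration: True means the review predates the
    # newest stored review, so pagination can stop.
    current = dateparser.parse(date_text, date_formats=['%Y-%m-%d'])
    # dateparser returns None for text it cannot parse; treat that as "keep going"
    return bool(current and last_user_review and current < last_user_review)

last_user_review = dateparser.parse('2020-03-01', date_formats=['%Y-%m-%d'])  # made-up date
print(is_older_than_last_scraped('2020-02-15', last_user_review))  # True  -> stop paginating
print(is_older_than_last_scraped('2020-03-20', last_user_review))  # False -> keep yielding
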
Example #2
    def parse_reviews(self, response):
        product = response.meta['product']
        rating_xpath = ".//*[@class='review--header-rating']/text()"
        title_xpath = ".//h3[contains(@class, 'review--header-title')]/text()"
        summary_xpath = ".//div[contains(@class, 'review--description')]//text()"
        header_xpath = ".//div[@class='review--header-review-info']//text()"
        date_xpath = ".//div[@class='review--header-review-info']/time/@datetime"

        pros_xpath = ".//li[contains(@class, 'pros-and-cons-pro')]//*[@class!='is-visually-hidden']/text()"
        cons_xpath = ".//li[contains(@class, 'pros-and-cons-con')]//*[@class!='is-visually-hidden']/text()"

        next_page_xpath = "//a[@rel='next']/@href"
        reviews = response.xpath("//li[contains(@class, 'reviews__list-item')]")

        last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
            self.mysql_manager, self.spider_conf['source_id'],
            product["source_internal_id"]
        )

        for review in reviews:
            date = self.extract_xpath(review, date_xpath)
            if date:
                date = date_format(date, '')
                current_user_review = dateparser.parse(date,
                                                       date_formats=['%Y-%m-%d'])
                if current_user_review and current_user_review < last_user_review:
                    return

            title = self.extract_xpath(review, title_xpath)
            rating = self.extract_xpath(review, rating_xpath)
            # Rating text reads "<score>/10"; keep only the score part
            if rating:
                rating = rating.split('/')[0]

            summary = self.extract_all_xpath(review, summary_xpath)
            pros = self.extract_all_xpath(review, pros_xpath, separator=' ; ')
            cons = self.extract_all_xpath(review, cons_xpath, separator=' ; ')
            author = ''
            header = self.extract_all_xpath(review, header_xpath)
            if header:
                # The header text lists the author before the '|' separator
                author = strip(header.split('|')[0])

            user_review = ReviewItem.from_product(product=product, tp='USER', rating=rating,
                                                  title=title, date=date, summary=summary,
                                                  pros=pros, cons=cons, author=author, scale=10)
            yield user_review

        next_page_url = self.extract_xpath(response, next_page_xpath)
        if next_page_url:
            next_page_url = get_full_url(response, next_page_url)
            request = Request(next_page_url, callback=self.parse_reviews)
            request.meta['product'] = product
            yield request
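
Examples #2 to #4 call extract_xpath and extract_all_xpath helpers that are not shown here. A plausible minimal sketch, assuming they are thin wrappers around Scrapy's selector API (the project's real implementations may differ):

    def extract_xpath(self, selector, xpath):
        # First matching node as stripped text, or an empty string when nothing matches
        value = selector.xpath(xpath).extract_first()
        return value.strip() if value else ''

    def extract_all_xpath(self, selector, xpath, separator=' '):
        # Every matching node joined into one string with the given separator
        parts = [part.strip() for part in selector.xpath(xpath).extract() if part.strip()]
        return separator.join(parts)
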
Example #3
    def parse_review(self, response):
        review_xpath = "//ul[@class='comments']/li"
        title_xpath = "./p[@class='hdr']/text()"
        summary_xpath = "./p[@class='msg']/text()"
        rating_xpath = "./ul[contains(@class, 'rating')]/@title"
        date_and_author_xpath = "./p[@class='auth']/text()"
        next_page_xpath = "//div[@class='pg']/a[@class='n']/@href"

        product = response.meta['product']
        last_user_review = response.meta['last_user_review']

        for review in response.xpath(review_xpath):
            date_and_author = self.extract_xpath(review, date_and_author_xpath)
            # Text looks like "Reviewed <date> by <author>, ...": strip the prefix,
            # drop anything after the first comma, then split date from author on 'by'
            if date_and_author.startswith('Reviewed'):
                date_and_author = date_and_author[len('Reviewed'):]
            date_and_author = date_and_author.split(',')[0]
            splitted = date_and_author.split('by')
            date = splitted[0].strip()
            author = splitted[1].strip() if len(splitted) > 1 else ''

            if date:
                date = date_format(date, '')
                current_user_review = dateparser.parse(
                    date, date_formats=['%Y-%m-%d'])
                if current_user_review and current_user_review < last_user_review:
                    return

            title = self.extract_xpath(review, title_xpath)
            rating = self.extract_xpath(review, rating_xpath)
            # Rating title reads "<score> out of 5"; keep only the score
            if rating:
                rating = rating.split(' out')[0]

            summary = self.extract_all_xpath(review, summary_xpath)

            user_review = ReviewItem.from_product(product=product,
                                                  tp='USER',
                                                  rating=rating,
                                                  title=title,
                                                  date=date,
                                                  summary=summary,
                                                  author=author,
                                                  scale=5)
            yield user_review

        next_page_url = self.extract(response.xpath(next_page_xpath))
        if next_page_url:
            next_page_request = Request(url=get_full_url(
                response, next_page_url),
                                        callback=self.parse_review,
                                        meta=response.meta)
            yield next_page_request
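
The date_and_author handling in example #3 folds the review date and the author into a single text node. A quick worked illustration of the splitting logic on a made-up input string:

date_and_author = 'Reviewed 12 March 2020 by Alice, verified purchase'  # made-up sample
if date_and_author.startswith('Reviewed'):
    date_and_author = date_and_author[len('Reviewed'):]
date_and_author = date_and_author.split(',')[0]           # ' 12 March 2020 by Alice'
splitted = date_and_author.split('by')
date = splitted[0].strip()                                 # '12 March 2020'
author = splitted[1].strip() if len(splitted) > 1 else ''  # 'Alice'
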
Example #4
    def parse_reviews(self, response):
        next_page_xpath = '//a[@class="next-arrow"]/@href'

        product = response.meta['product']
        last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
            self.mysql_manager, self.spider_conf['source_id'],
            product["source_internal_id"]
        )

        reviews = response.xpath('//div[contains(@class,"reviewWidget")]')
        for review in reviews:
            user_review = ReviewItem()
            date = self.extract(review.xpath('.//span[@class="reviewDate"]/text()'))
            if date:
                user_review['TestDateText'] = date_format(date, '')
                current_user_review = dateparser.parse(user_review['TestDateText'],
                                                       date_formats=['%Y-%m-%d'])
                if current_user_review and current_user_review < last_user_review:
                    return

            user_review['DBaseCategoryName'] = "USER"
            user_review['SourceTestScale'] = 5
            user_review['ProductName'] = product['ProductName']
            user_review['TestUrl'] = product['TestUrl']
            user_review['source_internal_id'] = product['source_internal_id']
            rating = self.extract(review.xpath('.//span[contains(@class,"ratingSpriteUnder")]/@class'))
            # The score is embedded in the class name (e.g. "ratingSprite_4-5" becomes "4.5").
            # str.strip() removes a set of characters, not a prefix, so use replace() instead.
            rating = rating.replace('ratingSpriteUnder', '').replace('ratingSprite_', '').strip()
            rating = rating.replace('-', '.')
            user_review['SourceTestRating'] = rating
            user_review['Author'] = self.extract(review.xpath('.//p[@class="name"]/text()'))
            user_review['TestTitle'] = self.extract(review.xpath('.//h2/text()'))
            user_review['TestSummary'] = self.extract_all(review.xpath('.//div[@class="reviewContainer"]/p/text()'))
            user_review['TestPros'] = self.extract_all(review.xpath('.//ul[@class="pros"]/li/text()'), '; ')
            user_review['TestCons'] = self.extract_all(review.xpath('.//ul[@class="cons"]/li/text()'), '; ')
            yield user_review

        next_page_url = self.extract_xpath(response, next_page_xpath)
        if next_page_url:
            next_page_url = get_full_url(response, next_page_url)
            request = Request(next_page_url, callback=self.parse_reviews)
            request.meta['product'] = product
            yield request
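
Example #4 recovers the rating from a CSS class name. A regex-based alternative to the prefix stripping above, shown on a made-up class value (the 'ratingSprite_<x>-<y>' pattern is inferred from the code, not confirmed against the live site):

import re

css_class = 'ratingSpriteUnder ratingSprite_4-5'  # made-up sample value
match = re.search(r'ratingSprite_(\d+(?:-\d+)?)', css_class)
rating = match.group(1).replace('-', '.') if match else ''
print(rating)  # '4.5'
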
Example #5
    def parse_reviews(self, response):
        jsonresponse = json.loads(response.body_as_unicode())
        if jsonresponse.get('BatchedResults', {}):
            query = jsonresponse['BatchedResults']['q0']
        elif jsonresponse.get('data', {}):
            query = jsonresponse['data']
        else:
            return

        limit = query['Limit']
        offset = query['Offset']
        totalReviews = query['TotalResults']
        stop_scraping = False

        bv_id = response.meta['bv_id']
        filter_other_sources = response.meta.get('filter_other_sources', True)
        extra_parser = response.meta.get('extra_parser', None)

        if offset == 0 and query.get('Includes', {}).get('Products', {}).get(
                bv_id, {}):
            product_info = query['Includes']['Products'][bv_id]

            # if product is not part of metadata, it should be parsed through BV API
            product = response.meta.get('product', None)
            if not product:
                product = self.parse_product_from_bv(
                    product_info,
                    response.meta.get('OriginalCategoryName', ''),
                    source_internal_id=response.meta.get('product_id', None))
                response.meta['product'] = product
                yield product

            for ean in product_info.get('EANs', []):
                ean_id = ProductIdItem.from_product(product,
                                                    kind='EAN',
                                                    value=ean)
                yield ean_id

            for upc in product_info.get('UPCs', []):
                upc_id = ProductIdItem.from_product(product,
                                                    kind='UPC',
                                                    value=upc)
                yield upc_id

            # The MPNs provided by BV API of some sources seem not very accurate.
            # They are more like source internal IDs.
            #for mpn in product_info.get('ManufacturerPartNumbers', []):
            #    mpn_id = self.product_id(product, kind='MPN', value=mpn)
            #    yield mpn_id

        for review in query['Results']:
            # review is from another product, skip
            if review.get('ProductId', bv_id) != bv_id:
                continue

            if filter_other_sources and review.get('IsSyndicated', True):
                continue

            parsedReview = self.parse_review(response, review, extra_parser)
            if response.meta.get('last_user_review', ''):
                current_user_review = dateparser.parse(
                    parsedReview['TestDateText'], date_formats=['%Y-%m-%d'])
                if current_user_review and current_user_review < response.meta['last_user_review']:
                    stop_scraping = True
                    break

            yield parsedReview

        # More reviews remain beyond this page and no stop condition was hit,
        # so request the next page of reviews from the BV API.
        if offset + limit < totalReviews and not stop_scraping:
            bv_params = self.bv_base_params.copy()
            bv_params['bv_id'] = bv_id
            bv_params['offset'] = offset + limit
            fullUrl = self.get_review_url(**bv_params)
            request = Request(fullUrl,
                              callback=self.parse_reviews,
                              meta=response.meta)
            yield request
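
The BV API pagination in example #5 advances by Limit until Offset + Limit reaches TotalResults. A small sketch of the offsets that would be requested, using illustrative numbers only:

limit, total_reviews = 30, 95  # illustrative numbers, not real API values

offset = 0
offsets_requested = [offset]
while offset + limit < total_reviews:
    offset += limit
    offsets_requested.append(offset)

print(offsets_requested)  # [0, 30, 60, 90]; the page at offset 90 returns the last 5 reviews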