def parse_review(self, response):
    """Yield a USER ReviewItem per review article on the page, then follow pagination.

    Stops early when a review older than the last stored review is seen
    (incremental crawl) or when an empty pros/cons review is reached.
    """
    next_page_xpath = "(//*[@rel='next']/@href)[1]"
    default_rating_xpath = './/reevoo-score/@data-score'
    product = response.meta['product']

    review_nodes = response.xpath('//article[contains(@id,"review_")]')
    if not review_nodes:
        return

    # From observation, at least currys.co.uk uses a different format to
    # present review rating, so callers may override the xpath via meta.
    rating_xpath = response.meta.get('rating_xpath', '') or default_rating_xpath

    last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
        self.mysql_manager,
        self.spider_conf['source_id'],
        product["source_internal_id"])

    for node in review_nodes:
        item = ReviewItem()
        raw_date = self.extract(
            node.xpath('.//span[contains(@class, "date_publish")]/text()'))
        if raw_date:
            item['TestDateText'] = date_format(raw_date, '')
            parsed_date = dateparser.parse(
                item['TestDateText'], date_formats=['%Y-%m-%d'])
            # Incremental crawl: everything past this point is already stored.
            if parsed_date < last_user_review:
                return
        item['DBaseCategoryName'] = "USER"
        item['ProductName'] = product['ProductName']
        item['TestUrl'] = product['TestUrl']
        item['SourceTestRating'] = self.extract(node.xpath(rating_xpath))
        item['Author'] = self.extract(
            node.xpath('.//h4[@class="attribution-name"]/text()'))
        item['TestPros'] = self.extract_all(
            node.xpath('.//dd[@class="pros"]/text()'))
        item['TestCons'] = self.extract_all(
            node.xpath('.//dd[@class="cons"]/text()'))
        item['source_internal_id'] = product['source_internal_id']
        # All reviews after first empty review are empty
        if not (item['TestPros'] or item['TestCons']):
            return
        yield item

    next_page_url = self.extract(response.xpath(next_page_xpath))
    if next_page_url:
        yield Request(get_full_url(response, next_page_url),
                      callback=self.parse_review,
                      meta=response.meta)
def parse_reviews(self, response):
    """Yield USER ReviewItems (rating on a 10 scale) and follow the next page.

    Returns early once a review is older than the last one stored for this
    product (incremental crawl).
    """
    product = response.meta['product']

    rating_xpath = ".//*[@class='review--header-rating']/text()"
    title_xpath = ".//h3[contains(@class, 'review--header-title')]/text()"
    summary_xpath = ".//div[contains(@class, 'review--description')]//text()"
    header_xpath = ".//div[@class='review--header-review-info']//text()"
    date_xpath = ".//div[@class='review--header-review-info']/time/@datetime"
    pros_xpath = ".//li[contains(@class, 'pros-and-cons-pro')]//*[@class!='is-visually-hidden']/text()"
    cons_xpath = ".//li[contains(@class, 'pros-and-cons-con')]//*[@class!='is-visually-hidden']/text()"
    next_page_xpath = "//a[@rel='next']/@href"

    last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
        self.mysql_manager,
        self.spider_conf['source_id'],
        product["source_internal_id"]
    )

    for node in response.xpath("//li[contains(@class, 'reviews__list-item')]"):
        date = self.extract_xpath(node, date_xpath)
        if date:
            date = date_format(date, '')
            # Incremental crawl cutoff: stop at already-stored reviews.
            if dateparser.parse(date, date_formats=['%Y-%m-%d']) < last_user_review:
                return

        title = self.extract_xpath(node, title_xpath)
        # Rating is rendered as "<score>/<scale>"; keep only the score part.
        rating = self.extract_xpath(node, rating_xpath).split('/')[0]
        summary = self.extract_all_xpath(node, summary_xpath)
        pros = self.extract_all_xpath(node, pros_xpath, separator=' ; ')
        cons = self.extract_all_xpath(node, cons_xpath, separator=' ; ')

        # Header looks like "<author> | ..."; author defaults to empty.
        author = ''
        header = self.extract_all_xpath(node, header_xpath)
        if header:
            author = strip(header.split('|')[0])

        yield ReviewItem.from_product(product=product,
                                      tp='USER',
                                      rating=rating,
                                      title=title,
                                      date=date,
                                      summary=summary,
                                      pros=pros,
                                      cons=cons,
                                      author=author,
                                      scale=10)

    next_page_url = self.extract_xpath(response, next_page_xpath)
    if next_page_url:
        request = Request(get_full_url(response, next_page_url),
                          callback=self.parse_reviews)
        request.meta['product'] = product
        yield request
def parse_review(self, response):
    """Yield USER ReviewItems (rating on a 5 scale) and follow the next page.

    The date/author line looks like "Reviewed <date> by <author>, ...".
    Returns early once a review is older than ``last_user_review`` from meta
    (incremental crawl).
    """
    review_xpath = "//ul[@class='comments']/li"
    title_xpath = "./p[@class='hdr']/text()"
    summary_xpath = "./p[@class='msg']/text()"
    rating_xpath = "./ul[contains(@class, 'rating')]/@title"
    date_and_author_xpath = "./p[@class='auth']/text()"
    next_page_xpath = "//div[@class='pg']/a[@class='n']/@href"

    product = response.meta['product']
    last_user_review = response.meta['last_user_review']

    for review in response.xpath(review_xpath):
        date_and_author = self.extract_xpath(review, date_and_author_xpath)
        if date_and_author.startswith('Reviewed'):
            date_and_author = date_and_author[len('Reviewed'):]
        date_and_author = date_and_author.split(',')[0]
        splitted = date_and_author.split('by')
        date = splitted[0].strip()
        # BUG FIX: author was previously assigned only when "by <author>" was
        # present, leaving it unbound on the first review (NameError) or stale
        # from the previous iteration. Reset it for every review.
        author = ''
        if len(splitted) > 1:
            author = splitted[1].strip()

        if date:
            date = date_format(date, '')
            current_user_review = dateparser.parse(
                date, date_formats=['%Y-%m-%d'])
            # Incremental crawl cutoff: stop at already-stored reviews.
            if current_user_review < last_user_review:
                return

        title = self.extract_xpath(review, title_xpath)
        # Rating title looks like "<score> out of <scale>"; keep the score.
        rating = self.extract_xpath(review, rating_xpath).split(' out')[0]
        summary = self.extract_all_xpath(review, summary_xpath)

        user_review = ReviewItem.from_product(product=product,
                                              tp='USER',
                                              rating=rating,
                                              title=title,
                                              date=date,
                                              summary=summary,
                                              author=author,
                                              scale=5)
        yield user_review

    next_page_url = self.extract(response.xpath(next_page_xpath))
    if next_page_url:
        next_page_request = Request(url=get_full_url(response, next_page_url),
                                    callback=self.parse_review,
                                    meta=response.meta)
        yield next_page_request
def parse_reviews(self, response):
    """Yield USER ReviewItems (5-point scale) from review widgets, then paginate.

    Returns early once a review predates the last stored review for this
    product (incremental crawl).
    """
    next_page_xpath = '//a[@class="next-arrow"]/@href'
    product = response.meta['product']

    last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
        self.mysql_manager,
        self.spider_conf['source_id'],
        product["source_internal_id"]
    )

    for widget in response.xpath('//div[contains(@class,"reviewWidget")]'):
        item = ReviewItem()

        raw_date = self.extract(widget.xpath('.//span[@class="reviewDate"]/text()'))
        if raw_date:
            item['TestDateText'] = date_format(raw_date, '')
            parsed = dateparser.parse(item['TestDateText'],
                                      date_formats=['%Y-%m-%d'])
            # Incremental crawl cutoff: stop at already-stored reviews.
            if parsed < last_user_review:
                return

        item['DBaseCategoryName'] = "USER"
        item['SourceTestScale'] = 5
        item['ProductName'] = product['ProductName']
        item['TestUrl'] = product['TestUrl']
        item['source_internal_id'] = product['source_internal_id']

        # Rating is encoded in the CSS class, e.g. "... ratingSprite_4-5" -> "4.5".
        # NOTE(review): str.strip() removes a *character set*, not a prefix —
        # this works because digits and '-' are not in the set, but it is
        # fragile if the class naming ever changes; verify before reuse.
        raw_rating = self.extract(
            widget.xpath('.//span[contains(@class,"ratingSpriteUnder")]/@class'))
        item['SourceTestRating'] = raw_rating.strip(
            'ratingSpriteUnder ratingSprite_').replace('-', '.')

        item['Author'] = self.extract(widget.xpath('.//p[@class="name"]/text()'))
        item['TestTitle'] = self.extract(widget.xpath('.//h2/text()'))
        item['TestSummary'] = self.extract_all(
            widget.xpath('.//div[@class="reviewContainer"]/p/text()'))
        item['TestPros'] = self.extract_all(
            widget.xpath('.//ul[@class="pros"]/li/text()'), '; ')
        item['TestCons'] = self.extract_all(
            widget.xpath('.//ul[@class="cons"]/li/text()'), '; ')

        yield item

    next_page_url = self.extract_xpath(response, next_page_xpath)
    if next_page_url:
        request = Request(get_full_url(response, next_page_url),
                          callback=self.parse_reviews)
        request.meta['product'] = product
        yield request
def parse_reviews(self, response):
    """Parse a Bazaarvoice (BV) JSON review API response.

    Yields, in order: optionally a product item plus EAN/UPC id items (first
    page only, when product metadata is included), then one parsed review per
    eligible result, then a Request for the next page when more results remain.

    Relies on response.meta keys: 'bv_id' (required), and optionally
    'filter_other_sources', 'extra_parser', 'product', 'product_id',
    'OriginalCategoryName', 'last_user_review'.
    """
    jsonresponse = json.loads(response.body_as_unicode())
    # The payload shape differs between BV endpoints: batched queries nest
    # under BatchedResults/q0, others under a top-level 'data' key.
    if jsonresponse.get('BatchedResults', {}):
        query = jsonresponse['BatchedResults']['q0']
    elif jsonresponse.get('data', {}):
        query = jsonresponse['data']
    else:
        return

    # Pagination bookkeeping supplied by the API itself.
    limit = query['Limit']
    offset = query['Offset']
    totalReviews = query['TotalResults']
    stop_scraping = False

    bv_id = response.meta['bv_id']
    filter_other_sources = response.meta.get('filter_other_sources', True)
    extra_parser = response.meta.get('extra_parser', None)

    # Product metadata is only processed on the first page (offset == 0) and
    # only when the API included it for our bv_id.
    if offset == 0 and query.get('Includes', {}).get('Products', {}).get(
            bv_id, {}):
        product_info = query['Includes']['Products'][bv_id]
        # if product is not part of metadata, it should be parsed through BV API
        product = response.meta.get('product', None)
        if not product:
            product = self.parse_product_from_bv(
                product_info,
                response.meta.get('OriginalCategoryName', ''),
                source_internal_id=response.meta.get('product_id', None))
            # Stash the product so later pages reuse it instead of re-parsing.
            response.meta['product'] = product
            yield product
        for ean in product_info.get('EANs', []):
            ean_id = ProductIdItem.from_product(product, kind='EAN', value=ean)
            yield ean_id
        for upc in product_info.get('UPCs', []):
            upc_id = ProductIdItem.from_product(product, kind='UPC', value=upc)
            yield upc_id
        # The MPNs provided by BV API of some sources seem not very accurate.
        # They are more like source internal IDs.
        #for mpn in product_info.get('ManufacturerPartNumbers', []):
        #    mpn_id = self.product_id(product, kind='MPN', value=mpn)
        #    yield mpn_id

    for review in query['Results']:
        # review is from another product, skip
        if review.get('ProductId', bv_id) != bv_id:
            continue
        # Skip reviews syndicated from other sources unless explicitly allowed.
        # Note the default True: results lacking IsSyndicated are filtered too.
        if filter_other_sources and review.get('IsSyndicated', True):
            continue
        parsedReview = self.parse_review(response, review, extra_parser)
        # Incremental crawl: once a review predates the last stored one, stop
        # both this page and pagination (results are assumed newest-first —
        # TODO confirm the API sort order).
        if response.meta.get('last_user_review', ''):
            current_user_review = dateparser.parse(
                parsedReview['TestDateText'], date_formats=['%Y-%m-%d'])
            if current_user_review < response.meta['last_user_review']:
                stop_scraping = True
                break
        yield parsedReview

    # first review page, now we know the total review count and
    # thus can trigger all other pages parsing.
    if offset + limit < totalReviews and not stop_scraping:
        # there is a need to call the next page of reviews
        bv_params = self.bv_base_params.copy()
        bv_params['bv_id'] = bv_id
        bv_params['offset'] = offset + limit
        fullUrl = self.get_review_url(**bv_params)
        request = Request(fullUrl,
                          callback=self.parse_reviews,
                          meta=response.meta)
        # NOTE(review): this increment is dead code — 'offset' is a local and
        # is not read again after this point.
        offset += limit
        yield request