def _populate_buyer_reviews(self, response, product):
    """Fill ``product['buyer_reviews']`` from the on-page review histogram.

    Any previously stored value is discarded first.  When the page has no
    review container, no parsable total, or a total of zero, the field is
    set to ``ZERO_REVIEWS_VALUE``.  Otherwise per-star counts are derived
    from the histogram bar widths (a percentage of the total), with the
    bars read as 5 stars first, down to 1 star.

    :param response: page response; only its ``xpath`` method is used.
    :param product: mutable product mapping, updated in place.
    :return: None
    """
    if 'buyer_reviews' in product:
        del product['buyer_reviews']

    if not response.xpath('//div[@id="reviews"]/div[@id="reviews"]'):
        product['buyer_reviews'] = ZERO_REVIEWS_VALUE
        return

    total_text = response.xpath(
        '//div[@class="_Ape"]/div/div/div[@class="_wpe"]/text()').extract()
    # Accept any number of thousands separators ("1,234,567").
    digits = re.search(r'\d[\d,]*', total_text[0]) if total_text else None
    total = int(digits.group(0).replace(',', '')) if digits else 0
    if not total:
        # Also protects the average below from dividing by zero.
        cond_set_value(product, 'buyer_reviews', ZERO_REVIEWS_VALUE)
        return

    bar_styles = response.xpath(
        '//div[@id="reviews"]/div[@id="reviews"]//div[@class="_Joe"]'
        '/div/a/div[@class="_Roe"]/@style |'
        '//div[@id="reviews"]//a/div[@class="_Roe"]/@style').extract()

    by_star = {}
    # Pairing with range(5, 0, -1) caps the histogram at five bars, so extra
    # matches can no longer create star keys of 0 or below.
    for star, style in zip(range(5, 0, -1), bar_styles):
        width = re.search(r'width:(\d+\.?\d*)%', style)
        percent = float(width.group(1)) if width else 0.0
        by_star[star] = int(round(total * percent / 100))

    weighted = sum(star * count for star, count in by_star.items())
    average = float(weighted) / total
    reviews = BuyerReviews(num_of_reviews=total,
                           average_rating=round(average, 1),
                           rating_by_star=by_star)
    cond_set_value(product, 'buyer_reviews', reviews)
def _parse_buyer_review(response, product_response): num_reviews = product_response.xpath( '//*[@itemprop="reviewCount"]/@content').extract()[0] average_rating = product_response.xpath( '//*[@itemprop="ratingValue"]/@content').extract()[0] rating_by_star = {'1': 0, '2': 0, '3': 0, '4': 0, '5': 0} stars = product_response.xpath( '//*[@class="pr-ratings-histogram-content"]' '//p[@class="pr-histogram-label"]//span/text()').re('\d+') values = product_response.xpath( '//*[@class="pr-ratings-histogram-content"]' '//p[@class="pr-histogram-count"]//span/text()').re('\d+') for (star, value) in zip(stars, map(int, values)): rating_by_star[star] += value stars = response.xpath('//*[@class="pr-info-graphic-amazon"]' '//dd/text()').re('(\d+) star') values = response.xpath('//*[@class="pr-info-graphic-amazon"]' '//dd/text()').re('\((\d+)\)') for (star, value) in zip(stars, map(int, values)): rating_by_star[star] += value buyer_reviews = BuyerReviews(num_of_reviews=num_reviews, average_rating=average_rating, rating_by_star=rating_by_star) return buyer_reviews or None
def _request_buyer_reviews(self, response): anonim_reviews = response.xpath('//div[@class="reevooReview"]') if anonim_reviews: total = len(anonim_reviews) stars = {} for review in anonim_reviews: regex = 'Score is (\d+)' count = review.xpath( '//div[@class="unverified_stars"]/@title').re(regex)[0] if count in stars.keys(): stars[count] += 1 else: stars[count] = 1 sum = 0 for k, v in stars.iteritems(): sum += int(k) * v avg = float(sum) / float(total) res = BuyerReviews(num_of_reviews=total, average_rating=avg, rating_by_star=stars) if total: response.meta['product']['buyer_reviews'] = res else: response.meta['product']['buyer_reviews'] = ZERO_REVIEWS_VALUE else: sku = response.css('p.productid::attr(class)').re('p_(\d+)') sku = sku[0] if sku else re.search('.+/([^,]+)', response.url).group(1) url = self.REVOO_URL.format(sku=sku) return url
def _parse_buyer_reviews(self, response):
    """Build ``BuyerReviews`` from the page's ``itemprop`` review metadata.

    If no ``reviewCount`` meta is found, the record is built from
    ``self.ZERO_REVIEWS_VALUE``.  Otherwise the count and average come from
    the meta tags and the per-star tally is made by counting the
    ``ratingValue`` of each review listed under ``#reviews``.

    :param response: product page response.
    :return: a ``BuyerReviews`` instance.
    """
    review_count = is_empty(
        response.xpath('//meta[@itemprop="reviewCount"]/@content').extract())
    if not review_count:
        return BuyerReviews(**self.ZERO_REVIEWS_VALUE)

    # Overall average, defaulting to 0.0 when the meta tag is absent.
    average = is_empty(
        response.xpath('//meta[@itemprop="ratingValue"]/@content').extract(),
        0.0)

    # One ratingValue per listed review; tally them by star.
    tally = dict.fromkeys(('1', '2', '3', '4', '5'), 0)
    per_review = response.xpath('//*[@id="reviews"]/./'
                                '/li/.//meta[@itemprop="ratingValue"]'
                                '/@content').extract()
    for value in per_review:
        tally[value] += 1

    return BuyerReviews(num_of_reviews=int(review_count),
                        average_rating=float(average),
                        rating_by_star=tally)
def _parse_prod_info_js(self, response):
    """Parse the ``bvGetReviewSummaries(...)`` payload of a JS response.

    When the payload holds a first ``Results`` entry, it is used to set
    ``product['buyer_reviews']`` and is handed to ``_parse_brand`` and
    ``_parse_department``.

    :param response: response whose body contains the JS call.
    :return: the result of ``send_next_request`` when requests are pending
        in ``meta['reqs']``, otherwise the product.
    """
    meta = response.meta.copy()
    pending = meta.get("reqs")
    product = meta['product']

    payload = is_empty(re.findall(r'bvGetReviewSummaries\((.+)\)',
                                  response.body_as_unicode()))
    first_result = None
    if payload:
        first_result = is_empty(json.loads(payload).get('Results', []))

    if first_result:
        # Buyer reviews
        review_fields = self._parse_buyer_reviews(first_result, response)
        product['buyer_reviews'] = BuyerReviews(**review_fields)
        # Brand, then department
        self._parse_brand(response, first_result)
        self._parse_department(response, first_result)

    if not pending:
        return product
    return self.send_next_request(pending, response)
def _parse_buyer_reviews(self, response):
    """Accumulate review scores across pages, then store the summary.

    Scores found on this page are appended to ``response.meta['scores']``.
    While a ``.next_page`` link exists, a request for it is returned with
    this method as callback.  On the last page the summary from
    ``_scrape_review_summary`` is combined with the per-score tally and
    written to ``response.meta['product']['buyer_reviews']``.

    :param response: a review listing page.
    :return: a ``Request`` for the next page, or ``None`` on the last page.
    """
    collected = response.meta.get('scores', [])
    titles = response.css('.overall_score_stars::attr(title)').extract()
    collected.extend(map(int, titles))
    response.meta['scores'] = collected

    next_link = response.css('.next_page::attr(href)')
    if next_link:
        return Request(urljoin(response.url, next_link[0].extract()),
                       self._parse_buyer_reviews,
                       meta=response.meta)

    try:
        avg, total = self._scrape_review_summary(response)
    except ValueError:
        avg = total = None

    if not total:
        response.meta['product']['buyer_reviews'] = ZERO_REVIEWS_VALUE
        return

    # Tally how often each score occurs.
    by_star = {}
    for score in collected:
        by_star[score] = by_star.get(score, 0) + 1

    response.meta['product']['buyer_reviews'] = BuyerReviews(
        num_of_reviews=int(total),
        average_rating=float(avg),
        rating_by_star=by_star)
def parse_buyer_reviews(self, response):
    """Store the review summary scraped from the page in the product.

    The total comes from the ``BVRRRatingSummaryHeaderCounterValue`` span,
    the average from the ``avgRating`` value in the raw body, and the
    per-star counts from the first five ``BVRRHistAbsLabel`` spans, read as
    5 stars first.  ``ZERO_REVIEWS_VALUE`` is stored unless both the total
    and the average are non-zero.

    :param response: review page response; ``meta`` carries ``product`` and
        optionally ``reqs``.
    :return: the result of ``send_next_request`` when requests are pending,
        otherwise the product.
    """
    product = response.meta.get("product")
    reqs = response.meta.get("reqs")

    total = int(
        is_empty(
            response.xpath(
                "//span[contains(@class, 'BVRRRatingSummaryHeaderCounterValue')]"
                "/text()").re(FLOATING_POINT_RGEX), 0))
    average = float(
        is_empty(re.findall(r'avgRating":(\d+\.\d+)', response.body), 0))

    labels = response.xpath(
        "//span[contains(@class, 'BVRRHistAbsLabel')]/text()").extract()[:5]

    rating_by_star = {}
    # Pairing each label with its star directly keeps the 5 -> 1 order and
    # no longer raises IndexError when fewer than five labels are found.
    for star, label in zip(range(5, 0, -1), labels):
        rating_by_star[star] = int(label.replace("\n", "").replace(
            "\t", "").replace("\\n", ""))

    if total and average:
        product["buyer_reviews"] = BuyerReviews(
            num_of_reviews=total,
            average_rating=average,
            rating_by_star=rating_by_star)
    else:
        product["buyer_reviews"] = ZERO_REVIEWS_VALUE

    if reqs:
        return self.send_next_request(reqs, response)
    return product
def _parse_buyer_reviews(self, response):
    """Build ``BuyerReviews`` from the review snapshot on the page.

    The average and the review count default to ``0.0`` and ``0`` when
    their elements are missing.  Per-star counts are read from the first
    ``pr-ratings-histogram-content`` list by pairing its labels and counts.

    :param response: product page response.
    :return: a ``BuyerReviews`` instance.
    """
    average_text = response.xpath(
        '//*[@class="pr-rating pr-rounded average"]/text()').extract()
    if average_text:
        average = float(average_text[0])
    else:
        average = 0.0

    count_text = response.xpath(
        '//*[@class="pr-snapshot-average-based-on-text"]'
        '/span/text()').extract()
    if count_text:
        review_count = int(count_text[0])
    else:
        review_count = 0

    labels = response.xpath(
        '(//ul[@class="pr-ratings-histogram-content"])[1]'
        '//*[@class="pr-histogram-label"]//span/text()').re(r'(\d+) Stars')
    counts = response.xpath(
        '(//ul[@class="pr-ratings-histogram-content"])[1]'
        '//*[@class="pr-histogram-count"]/span').re(r'(\d+)')

    by_star = {'1': 0, '2': 0, '3': 0, '4': 0, '5': 0}
    by_star.update((label, int(count)) for label, count in zip(labels, counts))

    return BuyerReviews(num_of_reviews=review_count,
                        average_rating=average,
                        rating_by_star=by_star)
def parse_buyer_reviews(self, response):
    """Merge one page of reviews into the product and queue related items.

    Per-star counts from ``self.br.parse_buyer_reviews_per_page`` are added
    to ``response.meta['marks']`` and the product's ``buyer_reviews`` is
    rebuilt from them.  When requests are already pending, a request for
    related products is appended before moving on.

    :param response: review page response.
    :return: the result of ``send_next_request`` when requests are pending,
        otherwise the product.
    """
    meta = response.meta.copy()
    page_reviews = self.br.parse_buyer_reviews_per_page(response)

    for star, count in page_reviews['rating_by_star'].iteritems():
        response.meta['marks'][star] += count

    product = response.meta['product']
    pending = meta.get('reqs')
    product['buyer_reviews'] = BuyerReviews(
        num_of_reviews=page_reviews['num_of_reviews'],
        average_rating=page_reviews['average_rating'],
        rating_by_star=response.meta['marks'])

    if not pending:
        return product

    related_url = self.RELATED_PRODUCT.format(product_id=self.product_id,
                                              index=0)
    pending.append(Request(url=related_url,
                           dont_filter=True,
                           callback=self.parse_related_product))
    return self.send_next_request(pending, response)
def _get_buyer_reviews(self, response): average = response.xpath( '//*[contains(@class, "average-score")]' '[contains(@itemprop, "ratingValue")]//text()').extract() if not average: return try: average = float(average[0]) except: self.log('Invalid buyer reviews at %s' % response.url) return num = response.xpath( '//meta[contains(@itemprop, "reviewCount")]/@content').extract() num = int(num[0].replace(',', '')) # scrape rating by star rating_by_star = {} for star_num, star_breakdown in enumerate( response.xpath('//*[contains(@id, "ratings-tooltip")]' '//*[contains(@class, "star-breakdowns")]' '//*[contains(@class, "star-breakdown")]')): current_mark = 5 - star_num if star_num >= 5: break star_count = star_breakdown.css( '.star-count ::text').extract()[0].replace(',', '') rating_by_star[str(current_mark)] = int( re.search('(\d+)', star_count).group(1)) return BuyerReviews(num_of_reviews=num, average_rating=average, rating_by_star=rating_by_star)
def _parse_review_api(self, response):
    """Read brand and review statistics from the review API response.

    The first ``{...}`` span of the body is decoded as JSON; the brand name
    and the ``ReviewStatistics`` of the first ``q0`` result are copied into
    the product.  A total of zero stores ``ZERO_REVIEWS_VALUE``.

    :param response: API response; ``meta`` carries ``product`` and
        optionally ``reqs``.
    :return: the result of ``send_next_request`` when requests are pending,
        otherwise the product.
    """
    product = response.meta['product']
    pending = response.meta.get('reqs', [])

    payload = json.loads(re.findall(r'\{.*\}', response.body)[0])
    result = payload['BatchedResults']['q0']['Results'][0]
    product['brand'] = result['Brand']['Name']

    statistics = result['ReviewStatistics']
    by_star = {}
    for entry in statistics['RatingDistribution']:
        by_star[entry['RatingValue']] = entry['Count']

    total = statistics['TotalReviewCount']
    if total == 0:
        product['buyer_reviews'] = ZERO_REVIEWS_VALUE
    else:
        product['buyer_reviews'] = BuyerReviews(
            num_of_reviews=total,
            average_rating=round(statistics['AverageOverallRating'], 1),
            rating_by_star=by_star)

    if not pending:
        return product
    return self.send_next_request(pending, response)
def populate_by_star(self, response):
    """Collect review scores across pages and store the final summary.

    Score titles from this page are appended to ``meta['total_scores']``.
    While a ``next_page`` link exists, a request for it is returned with
    this method as callback.  On the last page the titles are tallied for
    scores 1-10 and combined with the average and total parsed from the
    ``average_score`` title.  ``ZERO_REVIEWS_VALUE`` is stored when that
    title is missing, cannot be parsed, or reports zero reviews.

    :param response: a review listing page.
    :return: a ``Request`` for the next page, or the product on the last
        page.
    """
    total_scores = response.meta.get('total_scores', [])
    total_scores.extend(response.xpath(
        '//article[contains(@id, "review")]'
        '//span[contains(@class, "overall_score")]/@title').extract())

    next_url = response.xpath('//a[@class="next_page"]/@href').extract()
    if next_url:
        meta = response.meta.copy()
        meta['total_scores'] = total_scores
        return Request('http://mark.reevoo.com' + next_url[0],
                       callback=self.populate_by_star,
                       meta=meta)

    stars = {number: total_scores.count('%s out of 10' % number)
             for number in range(1, 11)}

    product = response.meta['product']
    summary = response.xpath(
        '//div[@class="average_score"]/@title').extract()
    # A page without the summary title used to raise IndexError here.
    avg = re.search(r'is\s(.*)\sout', summary[0]) if summary else None
    total = re.search(r'from\s(\d+)\sreview', summary[0]) if summary else None
    if avg is None or total is None or not int(total.group(1)):
        product['buyer_reviews'] = ZERO_REVIEWS_VALUE
        return product

    product['buyer_reviews'] = BuyerReviews(int(total.group(1)),
                                            float(avg.group(1)),
                                            stars)
    return product
def parse_buyer_reviews(self, response):
    """Merge one page of reviews into the product and queue related items.

    Per-star counts from ``self.br.parse_buyer_reviews_per_page`` are added
    to ``response.meta['marks']``, the product's ``buyer_reviews`` is
    rebuilt, and a related-products request is always appended to the
    pending requests.

    :param response: review page response.
    :return: the result of ``send_next_request``.
    """
    meta = response.meta.copy()
    page_reviews = self.br.parse_buyer_reviews_per_page(response)

    for star, count in page_reviews['rating_by_star'].iteritems():
        response.meta['marks'][star] += count

    product = response.meta['product']
    pending = meta.get('reqs', [])
    product['buyer_reviews'] = BuyerReviews(
        num_of_reviews=page_reviews['num_of_reviews'],
        average_rating=page_reviews['average_rating'],
        rating_by_star=response.meta['marks'])

    # Updated related product url, previous res-x doesn't work
    related_url = self.RELATED_PRODUCT.format(
        product_id=self.product_id + 'US',
        product_categories=self.product_categories,
        product_url=product.get('url'))
    related_request = Request(url=related_url,
                              dont_filter=True,
                              callback=self.parse_related_product,
                              meta=meta)
    pending.append(related_request)

    return self.send_next_request(pending, response)
def _parse_buyer_reviews(self, response):
    """Build ``BuyerReviews`` from the ``prod_ratings`` block.

    The average and the count default to ``0.0`` and ``0``.  The first five
    histogram counts are reversed and assigned to stars 1..5, but only when
    the review count is positive.

    :param response: product page response.
    :return: a ``BuyerReviews`` instance.
    """
    average_rating = is_empty(
        response.xpath('//div[@id="prod_ratings"]//span[@class="pr-rating '
                       'pr-rounded average"]/text()').extract(), 0.0)
    num_of_reviews = is_empty(
        response.xpath('//div[@id="prod_ratings"]//span[@class="count"]'
                       '/text()').extract(), 0)
    histogram_counts = response.xpath(
        '//p[@class="pr-histogram-count"]/span/text()').re(r'\d+')[:5]

    # Convert only truthy values; falsy defaults are passed through as-is.
    average_rating = float(average_rating) if average_rating else average_rating
    num_of_reviews = int(num_of_reviews) if num_of_reviews else num_of_reviews

    rating_by_star = dict.fromkeys(range(1, 6), 0)
    if num_of_reviews > 0:
        for star, count in enumerate(reversed(histogram_counts), 1):
            rating_by_star[star] = int(count)

    return BuyerReviews(num_of_reviews, average_rating, rating_by_star)
def _parse_buyer_review(self, response):
    """Tally per-review star widths into a ``BuyerReviews`` record.

    Each review's ``style`` attribute carries a number; 100/80/60/40/20 map
    to 5/4/3/2/1 stars and any other value is ignored.  The average comes
    from the ``ratingValue`` meta tag.

    :param response: product page response.
    :return: ``ZERO_REVIEWS_VALUE`` when no review styles are found,
        otherwise a ``BuyerReviews`` instance.
    """
    width_to_star = {'100': 5, '80': 4, '60': 3, '40': 2, '20': 1}
    rating_by_star = {'1': 0, '2': 0, '3': 0, '4': 0, '5': 0}

    styles = response.xpath('//div[@class="box-collateral box-reviews"]'
                            '/dl/dt/div/div/@style').extract()

    points = []
    for style in styles:
        width = re.findall(r'(\d+)', style)[0]
        if width in width_to_star:
            points.append(width_to_star[width])

    for point in points:
        rating_by_star[str(point)] += 1

    average_rating = response.xpath('//meta[@itemprop="ratingValue"]'
                                    '/@content').extract()

    if not styles:
        return ZERO_REVIEWS_VALUE

    return BuyerReviews(num_of_reviews=int(len(points)),
                        average_rating=float(average_rating[0]),
                        rating_by_star=rating_by_star)
def populate_reviews(response, reviews):
    """Populate `buyer_reviews` from a list of user ratings.

    Does nothing when ``reviews`` is empty or ``None``.  Otherwise a
    ``BuyerReviews`` record (count, mean, per-rating tally) is passed to
    ``cond_set_value`` for ``response.meta['product']``.

    :param response: response whose ``meta['product']`` receives the value.
    :param reviews: sequence of numeric ratings.
    :return: None
    """
    if not reviews:
        return

    from collections import Counter

    # Counter tallies in one pass instead of calling count() per rating.
    by_star = dict(Counter(reviews))
    # float() keeps the mean exact when the ratings are ints on Python 2.
    average = float(sum(reviews)) / len(reviews)
    buyer_reviews = BuyerReviews(num_of_reviews=len(reviews),
                                 average_rating=average,
                                 rating_by_star=by_star)
    cond_set_value(response.meta['product'], 'buyer_reviews', buyer_reviews)
def _parse_buyer_reviews(self, response):
    """Parse the embedded ``BVRRRatingSummarySourceID`` markup for reviews.

    The escaped HTML fragment is cut out of the body, un-escaped and
    parsed.  The average is read from the overall rating image title and
    the per-star counts are made by counting the ``ratingValue`` texts
    "1".."5".  ``ZERO_REVIEWS_VALUE`` is stored when the fragment is
    missing or when the average or the number of counted reviews is zero.

    :param response: response carrying the fragment; ``meta`` holds
        ``product`` and optionally ``reqs``.
    :return: the result of ``send_next_request`` when requests are pending,
        otherwise the product.
    """
    product = response.meta['product']
    reqs = response.meta.get('reqs', [])

    average_rating = 0.0
    rating_by_star = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}

    # A body without the fragment used to raise AttributeError on .group().
    found = re.search(r'BVRRRatingSummarySourceID":"(.+?)\},', response.body)
    if found:
        content = found.group(1).replace('\\"', '"').replace("\\/", "/")
        review_html = html.fromstring(content)

        overall = review_html.xpath(
            '//div[contains(@class,"BVRRQuickTakeSection")]'
            '//div[contains(@class,"BVRRRatingOverall")]'
            '//img[contains(@class,"BVImgOrSprite")]/@title')
        if overall:
            average_rating = float(overall[0].strip().split(" ")[0])

        values = review_html.xpath(
            '//div[contains(@class,"BVRRReviewDisplayStyle5")]'
            '//div[contains(@class,"BVRRReviewDisplayStyle5Header")]'
            '//span[@itemprop="ratingValue"]//text()')
        for star in rating_by_star:
            rating_by_star[star] = values.count(str(star))

    # Only values that are exactly "1".."5" count towards the total.
    num_of_reviews = sum(rating_by_star.values())

    if average_rating and num_of_reviews:
        product["buyer_reviews"] = BuyerReviews(
            num_of_reviews=int(num_of_reviews),
            average_rating=float(average_rating),
            rating_by_star=rating_by_star,
        )
    else:
        product["buyer_reviews"] = ZERO_REVIEWS_VALUE

    if reqs:
        return self.send_next_request(reqs, response)
    return product
def _parse_bv(self, response):
    """Parse the ``var materials=...`` payload for buyer reviews.

    Nothing is set unless the status is 200 and the payload is found.  The
    average is taken from the ``"avgRating"`` value in the raw text (0.0
    when absent or unparsable), the total and the per-star distribution
    from the ``BVRRRatingSummarySourceID`` markup.  A non-empty
    distribution stores a ``BuyerReviews`` record; an empty one with a
    zero total stores ``ZERO_REVIEWS_VALUE``.

    :param response: response whose ``meta['product']`` is updated.
    :return: the product.
    """
    product = response.meta['product']
    text = response.body_as_unicode().encode('utf-8')
    if response.status != 200:
        return product

    materials = re.search(r"var materials=(.*),\sinitializers=", text,
                          re.M | re.S)
    if not materials:
        return product

    summary_html = json.loads(materials.group(1))['BVRRRatingSummarySourceID']
    sel = Selector(text=summary_html.encode('utf-8'))

    # Default keeps ``avrg`` bound when the text carries no "avgRating";
    # it used to be undefined in that case.
    avrg = 0.0
    rating = re.search(r'"avgRating":(.*?),', text, re.M)
    if rating:
        try:
            avrg = float(rating.group(1))
        except ValueError:
            avrg = 0.0

    total = sel.xpath(
        "//div[@class='BVRRHistogram']"
        "/div[@class='BVRRHistogramTitle']"
        "/span[contains(@class,'BVRRNonZeroCount')]"
        "/span[@class='BVRRNumber']/text()").extract()
    try:
        total = int(total[0]) if total else 0
    except ValueError:
        total = 0

    rows = sel.xpath(
        "//div[@class='BVRRHistogram']"
        "/div[@class='BVRRHistogramContent']"
        "/div[contains(@class,'BVRRHistogramBarRow')]")
    distribution = {}
    for row in rows:
        name = row.xpath("span/span[@class='BVRRHistStarLabelText']"
                         "/text()").re(r"(\d) star")
        if not name:
            continue
        value = row.xpath("span[@class='BVRRHistAbsLabel']/text()").extract()
        if not value:
            continue
        try:
            distribution[int(name[0])] = int(value[0])
        except ValueError:
            # Rows with a non-numeric count are skipped.
            pass

    if distribution:
        reviews = BuyerReviews(total, avrg, distribution)
        cond_set_value(product, 'buyer_reviews', reviews)
    elif not total:
        cond_set_value(product, 'buyer_reviews', ZERO_REVIEWS_VALUE)
    return product
def parse_buyer_reviews(self, response):
    """Store buyer reviews parsed from inline JSON or from page markup.

    The per-star counts are first looked for in the ``temp = (...)`` JSON
    of the body.  When that JSON is missing or empty, the average and the
    counts are scraped from the ``ratetxt`` and ``row`` elements instead.
    The product's ``buyer_reviews`` is set to a ``BuyerReviews`` record
    when both average and total are non-zero, otherwise to ``0``.

    :param response: response whose ``meta`` carries ``product`` and
        optionally ``reqs``.
    :return: the result of ``send_next_request`` when requests are pending,
        otherwise the product.
    """
    product = response.meta.get("product")
    pending = response.meta.get("reqs")

    raw = is_empty(re.findall(r"temp\s+=\s+\(([^\)]*)", response.body), "")
    try:
        by_star = json.loads(raw)
    except ValueError:
        by_star = {}

    total = 0
    if by_star:
        for count in by_star.values():
            total += int(count)
        avg = is_empty(
            response.xpath(
                "//p[contains(@class, 'ig-heading')]/span/text()").extract(),
            0)
        if avg:
            # Keep only the part before the first "/".
            avg = float(is_empty(re.findall(r"([^\/]*)", str(avg)), 0))
    else:
        avg = float(
            is_empty(
                response.xpath(
                    "//div[contains(@class, 'ratetxt')]/span[1]/text()").
                re(FLOATING_POINT_RGEX), 0))
        # Two markup variants are probed, one after the other.
        for row_query in ("//div[contains(@class, 'row')]/span",
                          "//div[contains(@class, 'row')]"):
            for item in response.xpath(row_query):
                star = is_empty(
                    item.xpath("span[1]/text()").re(FLOATING_POINT_RGEX))
                if star:
                    by_star[star] = is_empty(
                        item.xpath("span[last()]/text()").re(
                            FLOATING_POINT_RGEX))
        for count in by_star.values():
            total += int(count)

    if avg and total:
        product["buyer_reviews"] = BuyerReviews(num_of_reviews=total,
                                                average_rating=avg,
                                                rating_by_star=by_star)
    else:
        product["buyer_reviews"] = 0

    if not pending:
        return product
    return self.send_next_request(pending, response)
def _parse_review(self, response):
    """Recount ratings per star from the JSON ``Results`` list.

    The existing count and average of ``product['buyer_reviews']`` are
    kept; only the per-star tally is replaced.

    :param response: JSON response; ``meta['product']`` is updated.
    :return: the product.
    """
    prod = response.meta['product']
    # The stored per-star mapping is discarded and rebuilt below.
    num, avg, _ = prod['buyer_reviews']

    payload = json.loads(response.body_as_unicode())
    tally = dict.fromkeys(range(1, 6), 0)
    for entry in payload['Results']:
        tally[entry['Rating']] += 1

    prod['buyer_reviews'] = BuyerReviews(num, avg, tally)
    return prod
def parse_buyer_reviews(self, response):
    """Parse the JSON-wrapped review widget into the product.

    The ``result`` entry of the JSON body is decoded and queried for the
    review count, average rating, per-star counts and the latest review
    date.  Any error is logged as a warning; when the body was not JSON at
    all, the request is queued again.

    :param response: response whose ``meta`` carries ``product`` and
        optionally ``reqs``.
    :return: the result of ``send_next_request`` when requests are pending,
        otherwise the product.
    """
    meta = response.meta.copy()
    product = response.meta['product']
    reqs = meta.get('reqs', [])
    try:
        jsonresponse = json.loads(response.body_as_unicode())
        response_selector = Selector(
            text=self._htmlspecialchars_decode(jsonresponse.get('result')))

        num_reviews = response_selector.xpath(
            '//span[@class="font-color-gray based-on"]/text()').re(r'\d+')
        num_reviews = num_reviews[0] if num_reviews else 0

        avg_rating = response_selector.xpath(
            '//span[@class="yotpo-star-digits"]/text()').extract()
        avg_rating = avg_rating[0].strip() if avg_rating else 0

        # Counts are reversed so that index 0 corresponds to 1 star.
        review_stars = response_selector.xpath(
            '//span[contains(@class, "yotpo-sum-reviews")]/text()').re(
                r'\((\d+)\)')[::-1]
        stars = product['buyer_reviews'].rating_by_star
        for star_index, star_value in enumerate(review_stars, 1):
            stars[str(star_index)] = star_value

        last_date = response_selector.xpath(
            '//label[contains(@class, "yotpo-review-date")]/text()'
        ).extract()
        product['buyer_reviews'] = BuyerReviews(num_of_reviews=num_reviews,
                                                average_rating=avg_rating,
                                                rating_by_star=stars)
        if last_date:
            last_buyer_review_date = datetime.datetime.strptime(
                last_date[0], '%m/%d/%y')
            product['last_buyer_review_date'] = \
                last_buyer_review_date.strftime('%d-%m-%Y')
    except Exception as e:
        # ``Exception`` rather than ``BaseException``: KeyboardInterrupt and
        # SystemExit must not be swallowed by a parsing fallback.
        self.log("Error extracting buyers reviews - {}".format(e), WARNING)
        # Compare against the message text; an exception object itself does
        # not reliably support ``in``.
        if 'No JSON object could be decoded' in str(e):
            self.log("Repeating buyers reviews request", WARNING)
            reqs.append(
                Request(response.url,
                        callback=self.get_price_and_stockstatus,
                        meta=meta,
                        dont_filter=True))
    if reqs:
        return self.send_next_request(reqs, response)
    return product
def _no_parse_reviews(self, response):
    """Store an all-zero ``BuyerReviews`` record in the product.

    :param response: response whose ``request.meta`` carries ``product``
        and optionally ``reqs``.
    :return: the result of ``send_next_request`` when requests are pending,
        otherwise the product.
    """
    request_meta = response.request.meta
    product = request_meta['product']
    pending = response.request.meta.get('reqs', [])

    empty_histogram = dict.fromkeys(('1', '2', '3', '4', '5'), 0)
    product['buyer_reviews'] = BuyerReviews(num_of_reviews=0,
                                            average_rating=0,
                                            rating_by_star=empty_histogram)

    if not pending:
        return product
    return self.send_next_request(pending, response)
def parse_buyer_reviews(self, response):
    """Set ``buyer_reviews`` from the ``pr-snapshot-rating`` block.

    The ``pr-rating`` span is the average rating and the ``count`` span is
    the number of reviews (the two were previously swapped).  Both are
    converted to numbers.  ``ZERO_REVIEWS_VALUE`` is used when either
    element is missing, unparsable, or the count is zero.

    :param response: response whose ``meta['product']`` receives the value
        through ``cond_set_value``.
    :return: None
    """
    avg_text = response.xpath(
        '//div[contains(@class, "pr-snapshot-rating")]//span[contains(@class, "pr-rating")]/text()'
    ).extract()
    total_text = response.xpath(
        '//div[contains(@class, "pr-snapshot-rating")]//p[contains(@class, "pr-snapshot-average-based-on-text")]/span[@class="count"]/text()'
    ).extract()

    try:
        total = int(total_text[0].strip().replace(',', ''))
        avg = float(avg_text[0].strip())
    except (IndexError, ValueError):
        # Missing or malformed snapshot: report no reviews, don't crash.
        total, avg = 0, 0.0

    ratings = []
    cond_set_value(response.meta['product'], 'buyer_reviews',
                   BuyerReviews(total, avg, ratings)
                   if total else ZERO_REVIEWS_VALUE)
def _parse_reviews(self, response, product):
    """Fetch the Bazaarvoice review widget and set ``buyer_reviews``.

    The product id is read from ``#productId`` or, failing that, from the
    digits of the "Web ID:" text.  The widget is downloaded synchronously
    with ``requests``; the review count and the per-star histogram are read
    from its ``BVRRSourceID`` markup.  ``ZERO_REVIEWS_VALUE`` is offered to
    ``cond_set_value`` at the end as a fallback.

    :param response: product page response.
    :param product: product mapping passed to ``cond_set_value``.
    :return: None
    """
    product_id = response.css('#productId::attr(value)').extract()
    if not product_id:
        web_id = response.xpath('//*[contains(@class,"productID")]'
                                '[contains(text(), "Web ID:")]/text()').extract()
        product_id = ([''.join(c for c in web_id[0] if c.isdigit())]
                      if web_id else [])

    if product_id:
        url = "http://macys.ugc.bazaarvoice.com/7129aa/%s" \
              "/reviews.djs?format=embeddedhtml" % (product_id[0],)
        # NOTE: blocking call; the timeout keeps a stalled server from
        # hanging the crawl forever.
        body = requests.get(url, timeout=30).text
        materials = re.findall("var materials=(.*)", body)
        if materials:
            # The last character of the match is dropped before decoding.
            data = json.loads(materials[0][0:-1])
            hxs = HtmlXPathSelector(text=data["BVRRSourceID"])
            num_of_reviews = hxs.xpath(
                '//div[@id="BVRRQuickTakeSummaryID"]'
                '/div/div/div/div/div/div/div/div/span'
                '/span[contains(@class, "BVRRNumber")]/text()'
            ).extract()
            labels = hxs.xpath(
                '//div/span[@class="BVRRHistAbsLabel"]/text()'
            ).extract()
            # Fewer than five labels used to raise IndexError.
            if num_of_reviews and len(labels) >= 5:
                num_of_reviews = int(num_of_reviews[0].replace(',', ''))
                # Reversed labels: the first five are read as 1..5 stars.
                counts = [int(label.replace(',', ''))
                          for label in labels[::-1][:5]]
                # Counts are stored as ints (they used to be strings).
                rating_by_star = dict(enumerate(counts, 1))
                count = sum(counts)
                if count:
                    review_sum = sum(star * amount for star, amount
                                     in rating_by_star.items())
                    average_rating = round(float(review_sum) / count, 2)
                    br = BuyerReviews(num_of_reviews, average_rating,
                                      rating_by_star)
                    cond_set_value(product, 'buyer_reviews', br)

    # Fallback; presumably a no-op when a value was stored above
    # (depends on cond_set_value - confirm).
    cond_set_value(product, 'buyer_reviews', ZERO_REVIEWS_VALUE)
def parse_buyer_reviews(self, response):
    """Store the reviews parsed by ``self.br`` in the product.

    ``self.br.br_count`` is set from ``meta['_br_count']`` before parsing.

    :param response: review response; ``meta`` must carry ``reqs``,
        ``_br_count`` and ``product``.
    :return: the result of ``send_next_request`` when requests are pending,
        otherwise the product.
    """
    meta = response.meta.copy()
    pending = meta['reqs']
    self.br.br_count = meta['_br_count']

    review_fields = self.br.parse_buyer_reviews_per_page(response)
    product = response.meta['product']
    product['buyer_reviews'] = BuyerReviews(**review_fields)

    if not pending:
        return product
    return self.send_next_request(pending, response)
def _parse_review(self, response):
    """Parse the Pluck review dialog into ``product['buyer_reviews']``.

    When at least five attribute rows are found, the first five are read
    as the counts for 5 stars down to 1 star; the total and the weighted
    average are derived from them.  ``ZERO_REVIEWS_VALUE`` is stored when
    fewer rows are present or when no reviews are counted.

    :param response: response whose ``meta`` carries ``product`` and
        optionally ``reqs``.
    :return: the result of ``send_next_request`` when requests are pending,
        otherwise the product.
    """
    product = response.meta['product']
    reqs = response.meta.get('reqs', [])

    marks = response.xpath(
        "//div[contains(@class,'pluck-dialog-middle')]"
        "//span[contains(@class,'pluck-review-full-attributes-name-post')]/text()"
    ).extract()

    rating_by_star = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
    if len(marks) >= 5:
        # zip() stops after five rows, so extra rows can no longer create
        # star keys of 0 or below.
        for star, mark in zip(range(5, 0, -1), marks):
            digits = re.search(r'\d+', mark)
            rating_by_star[star] = int(digits.group(0)) if digits else 0

    # Derived unconditionally, so both names are always bound (they were
    # undefined when fewer than five rows were found).
    num_of_reviews = sum(rating_by_star.values())
    weighted = sum(star * count for star, count in rating_by_star.items())
    average_rating = float(weighted) / num_of_reviews if num_of_reviews else 0.0

    if average_rating and num_of_reviews:
        product["buyer_reviews"] = BuyerReviews(
            num_of_reviews=int(num_of_reviews),
            average_rating=float(average_rating),
            rating_by_star=rating_by_star,
        )
    else:
        product["buyer_reviews"] = ZERO_REVIEWS_VALUE

    if reqs:
        return self.send_next_request(reqs, response)
    return product
def _populate_buyer_reviews(self, response, product): css = '#customer-reviews .rating::attr(style)' values = response.css(css).re('width:(\d+)') if not values: return values = [int(value) / 20 for value in values] total = len(values) avg = sum(values) / total by_star = {int(value): int(values.count(value)) for value in values} cond_set_value( product, 'buyer_reviews', BuyerReviews(num_of_reviews=total, average_rating=avg, rating_by_star=by_star))
def _parse_reviews(self, response):
    """Accumulate ratings across paginated review responses.

    The average and total come from the ``"attributes"`` JSON of the body;
    both default to zero when it is absent.  Individual ratings are pulled
    from the ``materials=`` payload and kept in ``meta['all_revs']`` while
    further pages are requested.  On the last page the summary is passed to
    ``cond_set_value`` for the product.

    :param response: a review page response.
    :return: a ``Request`` for the next page, or ``None`` once the summary
        has been stored.
    """
    # Defaults keep both names bound when the attributes JSON is missing.
    total = 0
    avg = 0.0
    attributes = re.findall(r'"attributes":(.*),"ciTrackingEnabled"',
                            response.body)
    if attributes:
        data = json.loads(attributes[0])
        avg = float(data['avgRating'])
        total = int(data['numReviews'])

    stars = {}
    materials = re.findall(r'materials=(.*),', response.body)
    if materials:
        data = json.loads(materials[0])
        all_revs = response.meta.get('all_revs', [])
        pattern = (r'itemprop="ratingValue" class="BVRRNumber'
                   r' BVRRRatingNumber">(\d+)<')
        # next(iter(...)) picks the first key without indexing ``keys()``.
        all_revs.extend(re.findall(pattern, data[next(iter(data))]))
        for number in range(1, 6):
            stars[number] = all_revs.count(str(number))

        # Buyer reviews populated on page by 8, 9-38, 39-68..
        if total > 8:
            page_counter = (total - 9) // 30 + 2
            meta = response.meta.copy()
            if response.meta.get('page_populated'):
                page_populated = int(response.meta['page_populated']) + 1
            else:
                page_populated = 2
            meta['page_populated'] = page_populated

            initial_url = response.meta.get('initial_url')
            if not initial_url:
                initial_url = response.url
                meta['initial_url'] = initial_url

            if page_populated <= page_counter:
                meta['all_revs'] = all_revs
                next_page = initial_url + "&page=%s" % page_populated
                return Request(next_page,
                               callback=self._parse_reviews,
                               meta=meta)

    product = response.meta['product']
    cond_set_value(
        product, 'buyer_reviews',
        BuyerReviews(total, avg, stars) if total else ZERO_REVIEWS_VALUE)
def _parse_buyer_reviews(self, response):
    """Set ``buyer_reviews`` from the JSON ``response.bottomline`` entry.

    ``ZERO_REVIEWS_VALUE`` is used when the entry is missing or reports a
    falsy ``total_review``.

    :param response: JSON response; ``meta['product']`` receives the value
        through ``cond_set_value``.
    :return: None
    """
    product = response.meta['product']
    payload = json.loads(response.body)

    try:
        bottomline = payload["response"]["bottomline"]
    except KeyError:
        cond_set_value(product, 'buyer_reviews', ZERO_REVIEWS_VALUE)
        return

    distribution = bottomline['star_distribution']
    average = float(bottomline['average_score'])
    review_total = bottomline['total_review']

    if review_total:
        value = BuyerReviews(review_total, average, distribution)
    else:
        value = ZERO_REVIEWS_VALUE
    cond_set_value(response.meta['product'], 'buyer_reviews', value)
def _get_stars_by_request(self, response):
    """Callback for the buyer-reviews request.

    The response body is JSON whose ``html`` entry holds the review markup.
    The review count, average rating and per-star histogram are cut out of
    that markup with regular expressions.  Any failure is logged and
    ``ZERO_REVIEWS_VALUE`` is stored instead.

    :param response: response whose ``meta`` carries ``product`` and
        optionally ``reqs``.
    :return: the result of ``send_next_request`` when requests are pending,
        otherwise the product.
    """
    meta = response.meta.copy()
    pending = meta.get('reqs')
    product = meta['product']
    raw = response.body_as_unicode()

    try:
        markup = json.loads(raw)['html']

        review_count = is_empty(
            re.findall(r'<span id="review_count">\s+(\d+)\s+</span>', markup))
        average = is_empty(
            re.findall(r'itemprop="ratingValue">\s+(\d.\d)\s+</span>', markup))
        histogram = re.findall(
            r'<tr class="histogramrating" data-rating="(\d+)" data-reviewcount="(\d+)">',
            markup)

        by_star = dict((star, int(count)) for star, count in histogram)
        product['buyer_reviews'] = BuyerReviews(
            num_of_reviews=int(review_count),
            average_rating=float(average),
            rating_by_star=by_star)
    except Exception as exc:
        message = 'Unable to parse buyer reviews from {url}: {exc}'.format(
            url=product['url'], exc=exc)
        self.log(message, ERROR)
        product['buyer_reviews'] = ZERO_REVIEWS_VALUE

    if not pending:
        return product
    return self.send_next_request(pending, response)