def parse_review(self, node, response):
    """Turn one feed entry into a professional (``PRO``) ReviewItem.

    Args:
        node: Mapping for a single review. ``title``, ``description`` and
            ``url`` are read directly; ``id``, ``review_date``,
            ``expert_evaluation_float``, ``conclusion`` and ``reviewer``
            are read from its nested ``meta`` mapping.
        response: Not used by this method.

    Returns:
        The populated ReviewItem.
    """
    meta = node.get('meta', {})

    item = ReviewItem()
    item['ProductName'] = node.get('title', '')
    item['source_internal_id'] = meta.get('id', '')
    item['TestDateText'] = meta.get('review_date', '')
    item['TestSummary'] = node.get('description', '')
    # The review title simply mirrors the product name.
    item['TestTitle'] = item.get('ProductName')
    item['TestUrl'] = node.get('url', '')

    item['SourceTestRating'] = meta.get('expert_evaluation_float', '')
    if item.get('SourceTestRating'):
        # Ratings from this source are expressed on a scale of 10.
        item['SourceTestScale'] = 10

    item['source_id'] = self.spider_conf['source_id']
    item['DBaseCategoryName'] = 'PRO'

    # Verdict and author are optional: only set them when the feed has them.
    verdict = meta.get('conclusion', '')
    if verdict:
        item['TestVerdict'] = verdict
    reviewer = meta.get('reviewer', '')
    if reviewer:
        item['Author'] = reviewer
    return item
def parse_review(self, response):
    """Build a professional (``PRO``) ReviewItem for a review page.

    The item is seeded from the page's ``Review`` JSON-LD when one is found,
    otherwise from its ``NewsArticle`` JSON-LD, otherwise it starts empty.
    URL, product name, source id, pros and cons are then filled in from the
    response itself.

    Args:
        response: Response for the review page.

    Returns:
        The populated ReviewItem.

    Raises:
        IndexError: If the URL has fewer than five "/"-separated chunks.
    """
    review_json_ld = extruct_helper.extract_json_ld(response.text, "Review")
    article_json_ld = extruct_helper.extract_json_ld(
        response.text, "NewsArticle")

    if review_json_ld:
        review = extruct_helper.review_item_from_review_json_ld(
            review_json_ld)
    elif article_json_ld:
        review = extruct_helper.review_item_from_article_json_ld(
            article_json_ld)
    else:
        review = ReviewItem()

    review['DBaseCategoryName'] = 'PRO'
    if not review.get('TestUrl', ''):
        review['TestUrl'] = response.url

    review['ProductName'] = self.extract(
        response.xpath(
            "//div[@class='productDataBlock']/ul/li[1]/strong/text()"))
    if not review.get('ProductName', ''):
        review['ProductName'] = self.get_product_name(response)

    # Read the id from the URL itself rather than from str(response): the
    # textual form of a response is a debugging aid, not a stable format.
    # The fifth "/"-separated chunk is the same in both, so the result is
    # unchanged; rstrip('>') is kept so the value stays byte-identical.
    review['source_internal_id'] = response.url.split("/")[4].rstrip('>')

    review['TestPros'] = self.extract(
        response.xpath("//div[@id='ahReviewPros']/ul/li/text()"))
    review['TestCons'] = self.extract(
        response.xpath("//div[@id='ahReviewCons']/ul/li/text()"))
    return review
def parse_review(self, response, reviewData, extra_parser=None):
    """Create a user (``USER``) ReviewItem from one review record.

    Args:
        response: Response whose ``meta['product']`` is the reviewed product.
        reviewData: Mapping for one review. ``Rating``, ``RatingRange``,
            ``SubmissionTime``, ``UserNickname``, ``Title``, ``ReviewText``,
            ``Pros`` and ``Cons`` are read with ``[]`` and must be present;
            ``TagDimensions`` is optional.
        extra_parser: Optional callable ``(review, reviewData) -> review``
            applied last; its return value becomes the result.

    Returns:
        The review item, or whatever ``extra_parser`` returned.
    """
    review = ReviewItem.from_product(
        product=response.meta['product'],
        rating=reviewData['Rating'],
        scale=reviewData['RatingRange'],
        date=date_format(reviewData['SubmissionTime'], '%Y-%m-%dT%H:%M:%S'),
        author=reviewData['UserNickname'],
        title=reviewData['Title'],
        summary=reviewData['ReviewText'],
        pros=reviewData['Pros'],
        cons=reviewData['Cons'],
        tp='USER')

    # When the record carries no explicit pros/cons, fall back to the values
    # of the matching tag dimension, joined with " ; ".
    for field, tag in (('TestPros', 'Pro'), ('TestCons', 'Con')):
        if review.get(field, ''):
            continue
        tag_values = reviewData.get(
            'TagDimensions', {}).get(tag, {}).get('Values', [])
        review[field] = ' ; '.join(tag_values)

    if extra_parser:
        return extra_parser(review, reviewData)
    return review
def _parse_reviews(self, selector, browser, product):
    """Yield user reviews from the current page and every following page.

    Only reviews that have both pros and cons text are yielded. After a page
    has been processed, the "next page" link is clicked through ``browser``
    (when such a link exists) and the resulting selector is processed the
    same way.

    Args:
        selector: Selector for the first page of reviews.
        browser: Object whose ``click(xpath)`` returns the next page's
            selector.
        product: Mapping providing ``ProductName`` and ``TestUrl``.

    Yields:
        ReviewItem objects with ``DBaseCategoryName`` set to ``USER``.
    """
    container_xp = "//article[contains(@id, 'review_')]"
    author_xp = ".//h4[@class='attribution-name']/text()"
    rating_xp = ".//div[@class='overall_score_stars']/@title"
    pros_xp = ".//dd[@class='pros']/text()"
    cons_xp = ".//dd[@class='cons']/text()"
    next_page_xp = "//a[@class='next_page']"

    page = selector
    while True:
        for node in page.xpath(container_xp):
            item = ReviewItem()
            item['DBaseCategoryName'] = "USER"
            item['ProductName'] = product['ProductName']
            item['TestUrl'] = product['TestUrl']
            item['Author'] = self.extract(node.xpath(author_xp))
            item['SourceTestRating'] = self.extract(node.xpath(rating_xp))
            item['TestPros'] = self.extract_all(
                node.xpath(pros_xp), separator=' ; ')
            item['TestCons'] = self.extract_all(
                node.xpath(cons_xp), separator=' ; ')
            if item['TestPros'] and item['TestCons']:
                yield item

        # Stop once the page no longer offers a "next page" link.
        if not page.xpath(next_page_xp):
            break
        page = browser.click(next_page_xp)
def init_item_by_xpaths(self, response, item_type, fields, selector=None):
    """Create an item of the requested type and fill it from XPaths.

    Args:
        response: Response used to build a selector when none is supplied
            and, for reviews and products, as the source of ``TestUrl``.
        item_type: One of ``'review'``, ``'product'``, ``'product_id'`` or
            ``'category'``.
        fields: Mapping of item field name to the XPath to evaluate for it.
        selector: Optional selector to run the XPaths against.

    Returns:
        The new item with every field in ``fields`` populated.

    Raises:
        Exception: If ``item_type`` is not one of the supported values.
    """
    if not selector:
        selector = Selector(response=response)

    if item_type not in ('review', 'product', 'product_id', 'category'):
        raise Exception("Invalid item type: %s" % item_type)

    if item_type == "review":
        item = ReviewItem()
    elif item_type == "product":
        item = ProductItem()
    elif item_type == "product_id":
        item = ProductIdItem()
    else:
        item = CategoryItem()

    if item_type in ('review', 'product'):
        item["TestUrl"] = response.url

    # Review fields whose matches are joined with " ; ". This must be a real
    # tuple: the previous ("TestPros, TestCons") was a single string, so the
    # membership test was a substring check that also matched names such as
    # "Test" or "Pros".
    joined_review_fields = ("TestPros", "TestCons")

    for field, xpath in fields.items():
        # TODO: maybe check field.
        matches = selector.xpath(xpath)
        if item_type == "review" and field in joined_review_fields:
            item[field] = self.extract_all(matches, " ; ")
        else:
            item[field] = self.extract_all(matches)
    return item
def parse_reviews(self, response):
    """Yield the product described by the page, then its user reviews.

    ``response.meta`` must provide ``category`` (its ``category_path`` is
    used) and ``product_id`` (used as source id and in the picture URL).

    Yields:
        One ProductItem followed by one ReviewItem per review box.
    """
    category = response.meta['category']
    product_id = response.meta['product_id']

    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = category['category_path']
    product['ProductName'] = self.extract(
        response.xpath('//span[@class="fn"]/text()'))
    product['PicURL'] = 'http://geizhals.at/p/' + product_id + '.jpg'
    product['source_internal_id'] = product_id
    yield product

    for box in response.xpath('//li[contains(@class,"gh_box")]'):
        item = ReviewItem()
        item['DBaseCategoryName'] = "USER"
        item['ProductName'] = product['ProductName']
        item['TestUrl'] = product['TestUrl']

        # Trim the characters "a", "m" and " " from both ends of the date
        # text before parsing it.
        raw_date = self.extract(
            box.xpath('.//div[@class="userbox"]/text()')).strip('am ')
        item['TestDateText'] = date_format(raw_date, "%d.%m.%Y %H:%M")

        item['SourceTestRating'] = self.extract(
            box.xpath('.//span[@itemprop="rating"]/text()'))
        item['Author'] = self.extract(
            box.xpath('.//span[contains(@class,"nick")]/text()'))
        item['TestTitle'] = self.extract(box.xpath('.//h3//text()'))
        item['TestSummary'] = self.extract_all(
            box.xpath('.//div[@itemprop="description"]//text()'))
        item['source_internal_id'] = product['source_internal_id']
        yield item
def parse_review(self, response):
    """Yield the user reviews on this page, then a request for the next one.

    ``response.meta`` must provide ``product`` (with ``ProductName`` and
    ``source_internal_id``) and ``review_url``. The same meta is forwarded
    to the follow-up request.

    Yields:
        ReviewItem objects (scale "5"), then optionally a Request handled
        by this same callback.
    """
    meta = response.meta
    product = meta['product']
    review_url = meta['review_url']

    for node in response.xpath("//div[@itemprop='review']"):
        item = ReviewItem()
        item['SourceTestRating'] = self.extract(
            node.xpath(".//*[@itemprop='ratingValue']/@content"))
        item['TestDateText'] = self.extract(
            node.xpath(".//span[@itemprop='datePublished']/text()"))
        # Text inside links is excluded from the summary.
        item['TestSummary'] = self.extract_all(
            node.xpath(".//p[@itemprop='reviewBody']//text()"
                       "[not(ancestor::a)]"))
        item['Author'] = self.extract(
            node.xpath(".//a[@itemprop='author']/text()"))
        item['TestTitle'] = self.extract(
            node.xpath(".//*[@itemprop='name']/text()"))
        item['TestUrl'] = review_url
        item["SourceTestScale"] = "5"
        item['ProductName'] = product['ProductName']
        item['source_internal_id'] = product['source_internal_id']
        item["DBaseCategoryName"] = "USER"
        if item["TestDateText"]:
            item["TestDateText"] = date_format(item["TestDateText"], '')
        yield item

    next_href = self.extract(response.xpath("//*[@rel='next']/@href"))
    if not next_href:
        return
    next_url = get_full_url(response.url, next_href)
    yield Request(next_url, callback=self.parse_review, meta=meta)
def _parse_reviews(self, selector, browser, product):
    """Yield one user review per ``data-review-id`` container.

    Args:
        selector: Selector for the page holding the reviews.
        browser: Not used by this method.
        product: Mapping providing ``ProductName`` and ``TestUrl``.

    Yields:
        ReviewItem objects with ``DBaseCategoryName`` set to ``USER``.
    """
    # NOTE(review): only the given selector is processed; no "next page"
    # link is followed here - confirm that this is intended.
    field_xpaths = (
        ('Author', ".//p[@class='pr-review-author-name']/span/text()"),
        ('SourceTestRating', ".//span[contains(@class, 'pr-rating')]/text()"),
        ('TestTitle', ".//p[@class='pr-review-rating-headline']"),
        ('TestSummary', ".//p[@class='pr-comments']/text()"),
        ('TestDateText',
         ".//div[contains(@class, 'pr-review-author-date')]/text()"),
    )

    for container in selector.xpath("//div[@data-review-id]"):
        item = ReviewItem()
        item['DBaseCategoryName'] = "USER"
        item['ProductName'] = product['ProductName']
        item['TestUrl'] = product['TestUrl']
        for field, xpath in field_xpaths:
            item[field] = self.extract(container.xpath(xpath))
        item['TestDateText'] = date_format(item['TestDateText'], '%d.%m.%Y')
        yield item
def parse_review(self, response):
    """Yield the single user review shown on this page.

    ``response.meta['product']`` must provide ``ProductName``. The date and
    the rating are only set when they can be found on the page.

    Yields:
        One ReviewItem with ``DBaseCategoryName`` set to ``USER``.
    """
    product = response.meta['product']

    user_review = ReviewItem()
    user_review['DBaseCategoryName'] = "USER"
    user_review['ProductName'] = product['ProductName']
    user_review['TestUrl'] = response.url

    date = self.extract(
        response.xpath(
            '//span[@class="dtreviewed"]/span[@class="value-title"]/@title'
        ))
    if date:
        user_review['TestDateText'] = date_format(date, '')

    rating_title = self.extract(
        response.xpath(
            '//div[@class="contentBox"]//a[contains(@class,"iReviewStars")]/@title'
        ))
    rating_match = re.findall(r'[^"]+ star', rating_title)
    # A page without a recognisable star rating used to raise IndexError
    # here and lose the whole review; now the rating is simply left unset.
    if rating_match:
        user_review['SourceTestRating'] = rating_match[0]

    user_review['Author'] = self.extract(
        response.xpath('//a[@class="memberName"]/text()'))
    user_review['TestTitle'] = self.extract(
        response.xpath('//h3[contains(@class,"reviewTitle")]/text()'))
    user_review['TestSummary'] = self.extract_all(
        response.xpath('//div[contains(@class,"reviewText")]//text()'))
    user_review['TestPros'] = self.extract_all(
        response.xpath('//span[@class="reviewPros"]/parent::div/text()'))
    user_review['TestCons'] = self.extract_all(
        response.xpath('//span[@class="reviewCons"]/parent::div/text()'))
    yield user_review
def parse_reviews(self, response):
    """Yield a review item and a product item for every article on the page.

    Each ``article.post-content`` element provides the title (used as both
    product name and review title), link, author, publication date, picture
    and body text. The source id is the second-to-last "/"-separated chunk
    of the article link.

    Yields:
        For each article, a ReviewItem followed by a ProductItem.
    """
    for content in response.xpath('//article[@class="post-content"]'):
        title = self.extract(
            content.xpath('.//div//h1[@class="post-title"]//text()'))
        test_url = self.extract(
            content.xpath('.//div//h1[@class="post-title"]//a/@href'))
        author = self.extract(
            content.xpath('.//span[@itemprop="name"]/text()'))
        date_str = self.extract_all(
            content.xpath('.//meta[@itemprop="datePublished"]/@content'))
        date = date_format(date_str, '%Y-%m-%d')
        pic = self.extract(content.xpath('.//img/@src'))
        summary = self.extract_all(
            content.xpath('.//div[@itemprop="articleBody"]//text()'))
        sid = test_url.split('/')[-2]

        # Build fresh items for every article. Sharing one instance across
        # iterations meant that items already handed out were overwritten by
        # the values of the following articles.
        product = ProductItem()
        product['ProductName'] = title
        product['PicURL'] = pic
        product['source_internal_id'] = sid
        product['TestUrl'] = test_url

        review = ReviewItem()
        review['ProductName'] = title
        review['TestTitle'] = title
        review['TestSummary'] = summary
        review['TestUrl'] = test_url
        review['DBaseCategoryName'] = 'pro'
        review['source_internal_id'] = sid
        review['TestDateText'] = date
        review['Author'] = author

        yield review
        yield product
def parse_review(self, response):
    """Yield the product and the user review described by this page.

    ``response.meta['category']`` is used as the product's original
    category name.

    Yields:
        A ProductItem, then a ReviewItem with ``DBaseCategoryName`` set to
        ``USER`` and a scale of "5".
    """
    # XPath fixes compared with the previous version:
    #  * "hearder" was a typo for the <header> element;
    #  * <meta> elements carry their value in @content, they have no text;
    #  * the summary XPath was not valid XPath (missing "/" and "@").
    product_name_xpath = "//header[@class='gutter-top']/h1[@itemprop='name']/text()"
    pic_url_xpath = "//meta[@property='og:image']/@content"
    test_title_xpath = "//meta[@property='og:title']/@content"
    test_summary_xpath = ("//div[@class='segment-article gutter-bottom-lg']"
                          "//div[@class='row']/div/p/text()")
    # Author and date are both read from the same "created by" element.
    created_by_xpath = ".//span[@class='review-created-by']/text()"

    product = ProductItem()
    product['ProductName'] = self.extract(response.xpath(product_name_xpath))
    product['OriginalCategoryName'] = response.meta['category']
    product['PicURL'] = self.extract(response.xpath(pic_url_xpath))
    yield product

    review = ReviewItem()
    review["TestUrl"] = response.url
    review["DBaseCategoryName"] = "USER"
    review["SourceTestScale"] = "5"
    review["ProductName"] = product["ProductName"]
    review["TestTitle"] = self.extract_all(response.xpath(test_title_xpath))
    review["TestSummary"] = self.extract_all(
        response.xpath(test_summary_xpath), " ")
    review["Author"] = self.extract(response.xpath(created_by_xpath))
    review["TestDateText"] = self.extract(response.xpath(created_by_xpath))
    # TODO: no rating is extracted although a scale is set. The rating image
    # is at ".//span[@class='review-rating']/img/@src"; turning that URL
    # into a number still has to be implemented.
    # The review used to be built and then dropped; hand it to the pipeline.
    yield review
def parse_reviews(self, response):
    """Yield one user review per entry of the ``reviews-list`` list.

    ``response.meta['product']`` must provide ``ProductName``, ``TestUrl``
    and ``source_internal_id``. The rating is the percentage found in the
    ``width:NN%`` style of the selected rating bar.

    Yields:
        ReviewItem objects with ``DBaseCategoryName`` set to ``USER``.
    """
    product = response.meta['product']

    for review in response.xpath('//ul[@class="reviews-list"]/li'):
        user_review = ReviewItem()
        user_review['DBaseCategoryName'] = "USER"
        user_review['ProductName'] = product['ProductName']
        user_review['TestUrl'] = product['TestUrl']
        user_review['source_internal_id'] = product['source_internal_id']

        date = self.extract(review.xpath('.//time/@datetime'))
        if date:
            user_review['TestDateText'] = date_format(date, "%Y %m %d")

        style = self.extract(
            review.xpath(
                './/div[contains(@class,"rateit-selected")]/@style'))
        # str.strip() removes a *set of characters*, not a prefix/suffix, so
        # the old strip('width:').strip('.00%') also ate the zeros that
        # belong to the number ("80.00%" -> "8", "100.00%" -> "1"). Cut the
        # value out explicitly instead.
        width = style.partition('width:')[2].partition('%')[0].strip()
        try:
            # "%g" drops a trailing ".00": "80.00" -> "80".
            rating = '%g' % float(width)
        except ValueError:
            rating = ''
        # NOTE(review): the value is a percentage (0-100) and no
        # SourceTestScale is set - confirm what the pipeline expects.
        user_review['SourceTestRating'] = rating

        user_review['Author'] = self.extract(
            review.xpath('.//div[@class="customer"]/span/text()'))
        user_review['TestTitle'] = self.extract(
            review.xpath('.//div[@class="title"]/text()'))
        user_review['TestSummary'] = self.extract_all(
            review.xpath('.//div[@class="copy"]/p/text()'))
        yield user_review
def parse_reviews(response):
    """Yield user reviews found in the raw text of ``response.body``.

    Each review is the text between a ``"CID":`` marker and the closing
    brace, not running into ``"Badges"``. Records without a submission date
    or a rating are skipped; author, title and summary are optional.

    Args:
        response: Object providing ``body`` and ``meta['product']`` (with
            ``ProductName`` and ``TestUrl``).

    Yields:
        ReviewItem objects with ``DBaseCategoryName`` set to ``USER``.
    """
    records = re.findall(r'"CID":(((?!("Badges")).)+)}', response.body)
    for record in records:
        raw = record[0]

        # The previous bare "except: pass" wrapped the yield as well, so it
        # also swallowed GeneratorExit and hid every programming error.
        # The cases it was there for are now handled explicitly.
        dates = re.findall(r'"SubmissionTime":"([\d-]+)', raw)
        ratings = re.findall(r'"Rating":([\d])', raw)
        if not dates or not ratings:
            continue
        try:
            test_date = date_format(dates[0], "%Y-%m-%d")
        except (TypeError, ValueError):
            # Unparseable date: skip this record, keep going with the rest.
            continue

        product = response.meta['product']
        user_review = ReviewItem()
        user_review['DBaseCategoryName'] = "USER"
        user_review['ProductName'] = product['ProductName']
        user_review['TestUrl'] = product['TestUrl']
        user_review['TestDateText'] = test_date
        user_review['SourceTestRating'] = ratings[0]

        optional_fields = (
            ('Author', r'"UserNickname":"([^"]+)'),
            ('TestTitle', r'"Title":"([^"]+)'),
            ('TestSummary', r'"ReviewText":"([^"]+)'),
        )
        for field, pattern in optional_fields:
            found = re.findall(pattern, raw)
            if found:
                user_review[field] = found[0]
        yield user_review
def parse_reviews(response):
    """Yield the user reviews in ``response.body`` that belong to the product.

    Each review is the text between ``TagDimensions`` and
    ``ModerationStatus``. Only records whose ``ProductId`` equals the
    product's ``source_internal_id`` are used. Records without a submission
    date or a rating are skipped; author, title and summary are optional.

    Args:
        response: Object providing ``body`` and ``meta['product']`` (with
            ``ProductName``, ``TestUrl`` and ``source_internal_id``).

    Yields:
        ReviewItem objects with ``DBaseCategoryName`` set to ``USER``.
    """
    records = re.findall(
        r'TagDimensions(((?!(TagDimensions|SyndicationSource)).)+)ModerationStatus',
        response.body)
    for record in records:
        raw = record[0]
        product = response.meta['product']

        product_ids = re.findall(r'"ProductId":"([\d-]+)', raw)
        if product['source_internal_id'] not in product_ids:
            continue

        # A record without a date or rating used to raise IndexError and
        # abort the whole callback, losing the remaining reviews too.
        dates = re.findall(r'"SubmissionTime":"([\d-]+)', raw)
        ratings = re.findall(r'"Rating":([\d])', raw)
        if not dates or not ratings:
            continue

        user_review = ReviewItem()
        user_review['DBaseCategoryName'] = "USER"
        user_review['ProductName'] = product['ProductName']
        user_review['TestUrl'] = product['TestUrl']
        user_review['source_internal_id'] = product['source_internal_id']
        user_review['TestDateText'] = date_format(dates[0], "%Y-%m-%d")
        user_review['SourceTestRating'] = ratings[0]

        optional_fields = (
            ('Author', r'"UserNickname":"([^"]+)'),
            ('TestTitle', r'"Title":"([^"]+)'),
            ('TestSummary', r'"ReviewText":"([^"]+)'),
        )
        for field, pattern in optional_fields:
            found = re.findall(pattern, raw)
            if found:
                user_review[field] = found[0]
        yield user_review
def parse_reviews(self, response):
    """Yield the product described by the page, then its user reviews.

    ``response.meta['category']['category_path']`` becomes the product's
    original category name. Reviews are the ``li.opinion-row`` elements.

    Yields:
        One ProductItem followed by one ReviewItem per opinion row.
    """
    def page_value(xpath):
        return self.extract(response.xpath(xpath))

    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = response.meta['category'][
        'category_path']
    product['ProductName'] = page_value('//h1/a/text()')
    product['PicURL'] = page_value('//meta[@property="og:image"]/@content')
    product['ProductManufacturer'] = page_value(
        '//meta[@itemprop="brand"]/@content')
    product['source_internal_id'] = page_value('//@data-product-id')
    yield product

    for row in response.xpath('//li[@class="opinion-row"]'):
        item = ReviewItem()
        item['DBaseCategoryName'] = "USER"
        item['ProductName'] = product['ProductName']
        item['TestUrl'] = product['TestUrl']
        item['source_internal_id'] = product['source_internal_id']

        published = self.extract(
            row.xpath('.//meta[@itemprop="datePublished"]/@content'))
        item['TestDateText'] = date_format(published, "%Y %m %d")

        item['SourceTestRating'] = self.extract(
            row.xpath('.//meta[@itemprop="ratingValue"]/@content'))
        item['Author'] = self.extract(row.xpath('.//h4/text()'))
        item['TestTitle'] = self.extract(
            row.xpath('.//div[contains(@class,"grade-text")]/text()'))
        item['TestSummary'] = self.extract_all(
            row.xpath('.//div[@itemprop="description"]/text()'))
        yield item
def parse_product(self, response):
    """Yield the product on this page, then one item per user review.

    ``response.meta['category']['category_path']`` becomes the product's
    original category name. Reviews are the ``article`` elements carrying
    an ``itemscope`` attribute; a review date is only set when a
    ten-character run of digits and slashes is found in the date text.

    Yields:
        One ProductItem followed by ReviewItem objects.
    """
    def page_value(xpath):
        return self.extract(response.xpath(xpath))

    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = response.meta['category']['category_path']
    product['ProductName'] = page_value('//h1/text()')
    product['PicURL'] = page_value('//div[@class="images"]/a/img/@src')
    product['ProductManufacturer'] = page_value(
        '//span[text()="Marca"]/parent::li/span[@class="value"]/text()')
    product['source_internal_id'] = page_value('//input[@id="prodId"]/@value')
    yield product

    for article in response.xpath('//article[@itemscope]'):
        item = ReviewItem()
        item['DBaseCategoryName'] = "USER"
        item['ProductName'] = product['ProductName']
        item['source_internal_id'] = product['source_internal_id']
        item['TestUrl'] = product['TestUrl']

        date_text = self.extract(article.xpath('.//div[@class="date"]/text()'))
        found_dates = re.findall(r'[\d/]{10}', date_text)
        if found_dates:
            item['TestDateText'] = date_format(found_dates[0], "%d/%m/%Y")

        item['SourceTestRating'] = self.extract(
            article.xpath('.//span[@itemprop="ratingValue"]/text()'))
        item['Author'] = self.extract(article.xpath('.//h2/a/text()'))
        item['TestTitle'] = self.extract(article.xpath('.//h3/a/text()'))
        item['TestSummary'] = self.extract_all(
            article.xpath('.//p[@itemprop="reviewBody"]/text()'))
        item['TestPros'] = self.extract_all(
            article.xpath('.//div[@class="pro"]//li/text()'), '; ')
        item['TestCons'] = self.extract_all(
            article.xpath('.//div[@class="con"]//li/text()'), '; ')
        yield item
def parse_product(self, response):
    """Yield the product on this page, then one item per user review.

    ``response.meta['category']['category_path']`` becomes the product's
    original category name. Opinion blocks that contain a link whose text
    includes "Opinia z serwisu Ceneo.pl" are left out.

    Yields:
        One ProductItem followed by ReviewItem objects.
    """
    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = response.meta['category']['category_path']
    product['ProductName'] = self.extract(
        response.xpath('//h1[@itemprop="itemreviewed"]/text()'))
    product['PicURL'] = self.extract(
        response.xpath('//div[@class="productPhotoGallery"]/div/img/@src'))
    # The placeholder text "brak" is not accepted as a manufacturer.
    product['ProductManufacturer'] = self.extract(response.xpath(
        '//div[@class="manufacturer"]//span[not(text()="brak")]/text()'))
    yield product

    opinions = response.xpath(
        '//div[@class="opinion"][not(descendant::a[contains(text(),"Opinia z serwisu Ceneo.pl")])]')
    for opinion in opinions:
        item = ReviewItem()
        item['DBaseCategoryName'] = "USER"
        item['ProductName'] = product['ProductName']
        item['TestUrl'] = product['TestUrl']

        posted = self.extract(opinion.xpath('.//span[@class="date"]/text()'))
        item['TestDateText'] = date_format(posted, "%Y-%m-%d")

        item['SourceTestRating'] = self.extract(
            opinion.xpath('.//span[@class="points"]/text()'))
        item['Author'] = self.extract_all(
            opinion.xpath('.//*[@class="profileName"]//text()'))
        item['TestSummary'] = self.extract_all(
            opinion.xpath('.//div[@class="text"]//text()'))
        item['TestPros'] = self.extract_all(
            opinion.xpath('.//ul[@class="pluses"]//span/text()'), '; ')
        item['TestCons'] = self.extract_all(
            opinion.xpath('.//ul[@class="minuses"]//span/text()'), '; ')
        yield item
def review_item_from_review_json_ld(json_ld, _review=None, overwrite=False):
    """Fill a ReviewItem from a ``Review`` JSON-LD mapping.

    Rating scale, rating and product name are taken from ``reviewRating``
    and ``itemReviewed``; everything else is delegated to
    ``review_item_from_article_json_ld``.

    Args:
        json_ld: Mapping holding the ``Review`` JSON-LD data.
        _review: Optional existing item to complete. A new ReviewItem is
            created only when this is ``None``.
        overwrite: When true, values already present in the item are
            replaced; otherwise only missing or empty fields are filled.

    Returns:
        The completed item.
    """
    # Use "is not None": an item that is still empty is falsy, and must not
    # be silently replaced by a brand-new one.
    review = _review if _review is not None else ReviewItem()

    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is
    # its replacement. Fall back to the old method on Python 2.
    try:
        from html import unescape
    except ImportError:
        unescape = HTMLParser().unescape

    review_rating_obj = json_ld.get('reviewRating', {})
    if review_rating_obj and (overwrite or not review.get('SourceTestScale', '')):
        # according to Google Developers, 5 is the default best rating
        review['SourceTestScale'] = review_rating_obj.get('bestRating', 5)

    if review_rating_obj and (overwrite or not review.get('SourceTestRating', '')):
        review['SourceTestRating'] = review_rating_obj.get('ratingValue', None)
        if review.get('SourceTestRating') is not None:
            # Do not keep a rating from JSON-LD that is below the worst
            # possible rating.
            try:
                # according to Google Developers, 1 is the default worst rating
                worst_rating = float(review_rating_obj.get('worstRating', 1))
                if float(review['SourceTestRating']) < worst_rating:
                    review['SourceTestRating'] = ''
            except (TypeError, ValueError):
                # Non-numeric rating or worst rating: nothing to compare,
                # leave the rating as it was found.
                pass

    if overwrite or not review.get('ProductName', ''):
        item_reviewed = json_ld.get('itemReviewed', {})
        # Only a mapping can carry a "name"; ignore any other shape.
        product_name = ''
        if isinstance(item_reviewed, dict):
            product_name = item_reviewed.get('name', '')
        if product_name:
            review['ProductName'] = unescape(product_name).strip()

    # For all the information we can extract from 'Article' JSON-LD, the way
    # to extract them from 'Review' JSON-LD is exactly the same
    review = review_item_from_article_json_ld(json_ld, review, overwrite)
    return review
def parse_product(self, response):
    """Yield the product on this page, then one item per user review.

    ``response.meta['category']['category_path']`` becomes the product's
    original category name. Reviews are the ``li`` elements marked with
    ``itemprop="review"``.

    Yields:
        One ProductItem followed by ReviewItem objects.
    """
    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = response.meta['category']['category_path']
    product['ProductName'] = self.extract(response.xpath('//h1/text()'))

    picture_src = self.extract(response.xpath(
        '//div[@class="product-carousel"]//img[@itemprop="image"][1]/@src'))
    product['PicURL'] = get_full_url(response, picture_src)

    product['ProductManufacturer'] = self.extract(response.xpath(
        '//td[text()="Constructeur"]/following-sibling::td/text()'))
    yield product

    review_fields = (
        ('SourceTestRating', './/span[@itemprop="ratingValue"]/text()'),
        ('Author', './/span[@itemprop="author"]/text()'),
        ('TestTitle', './/div[@itemprop="name"]/text()'),
    )
    for node in response.xpath('//li[@itemprop="review"]'):
        item = ReviewItem()
        item['DBaseCategoryName'] = "USER"
        item['ProductName'] = product['ProductName']
        item['TestUrl'] = product['TestUrl']

        published = self.extract(
            node.xpath('.//span[@itemprop="datePublished"]/text()'))
        item['TestDateText'] = date_format(published, '%d/%m/%Y')

        for field, xpath in review_fields:
            item[field] = self.extract(node.xpath(xpath))
        item['TestSummary'] = self.extract_all(
            node.xpath('.//blockquote/text()'))
        yield item
def parse_product(self, response):
    """Yield product, user reviews and a request for the expert review.

    Nothing is produced when the page has no ``section`` containing a
    review ``article``. Otherwise the product is yielded first, then every
    user review, and finally - when a link is found in the expert article -
    a Request handled by ``self.parse_review`` with the product in its meta.

    Yields:
        A ProductItem, ReviewItem objects, and optionally a Request.
    """
    sections = response.xpath('//section[article[contains(@class,"review")]]')
    if not sections:
        return

    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = 'Cell Phones'
    product['ProductName'] = self.extract(
        response.xpath('//meta[@itemprop="name"]/@content'))
    picture = self.extract(
        response.xpath('//meta[@property="og:image"]/@content'))
    product['PicURL'] = get_full_url(response, picture)
    product['ProductManufacturer'] = self.extract(
        response.xpath('//meta[@itemprop="brand"]/@content'))
    yield product

    for article in sections.xpath('./article[@itemprop="review"]'):
        item = ReviewItem()
        item['DBaseCategoryName'] = "USER"
        item['ProductName'] = product['ProductName']
        item['TestUrl'] = product['TestUrl']

        posted = self.extract(article.xpath('.//span[@class="time"]/text()'))
        item['TestDateText'] = date_format(posted, '')

        item['SourceTestRating'] = self.extract(
            article.xpath('.//meta[@itemprop="ratingValue"]/@content'))
        item['Author'] = self.extract(
            article.xpath('.//span[@itemprop="author"]/text()'))
        item['TestPros'] = self.extract_all(article.xpath(
            './/div[contains(@class,"positives")]/text()'), '; ')
        item['TestCons'] = self.extract_all(article.xpath(
            './/div[contains(@class,"negatives")]/text()'), '; ')
        yield item

    expert_href = self.extract(
        sections.xpath('./article[contains(@class,"expert")]/div/a/@href'))
    if expert_href:
        follow_up = Request(url=get_full_url(response, expert_href),
                            callback=self.parse_review)
        follow_up.meta['product'] = product
        yield follow_up
def review_item_from_article_json_ld(json_ld, _review=None, overwrite=False):
    """Fill a review item from an article-style JSON-LD mapping.

    Sets ``TestSummary`` (from ``description``), ``TestTitle`` (from
    ``name``, else ``headline``), ``Author`` (from ``author``) and
    ``TestDateText`` (from ``datePublished``). Text values are
    HTML-unescaped and stripped.

    Args:
        json_ld: Mapping holding the JSON-LD data.
        _review: Optional existing item to complete. A new ReviewItem is
            created only when this is ``None``.
        overwrite: When true, values already present in the item are
            replaced; otherwise only missing or empty fields are filled.

    Returns:
        The completed item.
    """
    # Use "is not None": an item that is still empty is falsy, and must not
    # be silently replaced by a brand-new one.
    review = _review if _review is not None else ReviewItem()

    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is
    # its replacement. Fall back to the old method on Python 2.
    try:
        from html import unescape
    except ImportError:
        unescape = HTMLParser().unescape

    text_types = (str, type(u''))

    if overwrite or not review.get('TestSummary', ''):
        summary = json_ld.get('description', '')
        if summary:
            review['TestSummary'] = unescape(summary).strip()

    if overwrite or not review.get('TestTitle', ''):
        title = json_ld.get('name', '') or json_ld.get('headline', '')
        if title:
            review['TestTitle'] = unescape(title).strip()

    if overwrite or not review.get('Author', ''):
        # "author" may be one object, a list of objects/names or a plain
        # name. The old code only coped with the first two and raised for
        # a string or null author.
        author = json_ld.get('author', {})
        if isinstance(author, dict):
            names = [author.get('name', '')]
        elif isinstance(author, (list, tuple)):
            names = [
                entry.get('name', '') if isinstance(entry, dict) else entry
                for entry in author
            ]
        else:
            names = [author]
        author_str = ', '.join(
            name for name in names if name and isinstance(name, text_types))
        if author_str:
            review['Author'] = unescape(author_str).strip()

    if overwrite or not review.get('TestDateText', ''):
        test_date_text = json_ld.get('datePublished', '')
        if test_date_text:
            review['TestDateText'] = date_format(test_date_text, '')

    return review
def level_4(self, response):
    """Yield user reviews from a JSONP review batch.

    The body is expected to be JSON wrapped in a ``bv_1111_60234(...)``
    call; the reviews are read from ``BatchedResults.q0.Results``.
    ``response.meta`` must provide ``ProductName`` and ``TestUrl``.

    Reviews that lack one of the expected keys, or whose date cannot be
    formatted, are skipped individually.

    Yields:
        ReviewItem objects with ``DBaseCategoryName`` set to ``USER`` and a
        scale of '5'.
    """
    pname = response.meta["ProductName"]
    test_url = response.meta["TestUrl"]

    # Remove the JSONP callback name and its surrounding parentheses.
    json_string = response.body.replace('bv_1111_60234', '').strip('()')
    data = json.loads(json_string)
    results = data['BatchedResults']['q0']['Results']

    for item in results:
        # The former bare "except: pass" around the whole loop silently
        # dropped every remaining review after the first malformed one.
        # Only the data access is guarded now, one review at a time.
        try:
            product_id = item['ProductId']
            submitted = item['SubmissionTime']
            rating = item['Rating']
            author = item['UserNickname']
            title = item['Title']
            summary = item['ReviewText']
            pros = item['Pros']
            cons = item['Cons']
            if submitted:
                submitted = date_format(submitted, '')
        except (KeyError, TypeError, ValueError):
            continue

        review = ReviewItem()
        review['DBaseCategoryName'] = "USER"
        review['ProductName'] = pname
        review['TestUrl'] = test_url
        review['source_internal_id'] = product_id
        review['TestDateText'] = submitted
        review['SourceTestRating'] = rating
        review['SourceTestScale'] = '5'
        review['Author'] = author
        review['TestTitle'] = title
        review['TestSummary'] = summary
        review['TestPros'] = pros
        review['TestCons'] = cons
        yield review
def parse_product(self, response):
    """Yield the product, its MPN id (when found) and its user reviews.

    The original category name is read from the ``sectionValue = "..."``
    assignment of an inline script; it is left empty when the page has no
    such assignment. The manufacturer is always ``'HP'``.

    Yields:
        A ProductItem, optionally a product id item of kind ``MPN``, then
        one ReviewItem per ``div`` marked with ``itemprop="review"``.
    """
    product = ProductItem()
    product['TestUrl'] = response.url

    ocn = self.extract(response.xpath(
        '//script[@type="text/javascript"][contains(text(),"sectionValue")]/text()'))
    ocn_match = re.findall(r'sectionValue = "([^"]+)"', ocn)
    # Indexing an empty match list used to raise IndexError and lose the
    # product together with all of its reviews.
    product['OriginalCategoryName'] = ocn_match[0] if ocn_match else ''

    product['ProductName'] = self.extract(
        response.xpath('//h1/span[@itemprop="name"]/text()'))
    pic_url = self.extract(
        response.xpath('//ul/li[1]/img[@itemprop="image"]/@src'))
    if pic_url:
        pic_url = get_full_url(response, pic_url)
    product['PicURL'] = pic_url
    product['ProductManufacturer'] = 'HP'
    yield product

    mpn = self.extract_list(response.xpath('//span[@class="prodNum"]/text()'))
    if mpn:
        product_id = self.product_id(product)
        product_id['ID_kind'] = "MPN"
        product_id['ID_value'] = mpn[0]
        yield product_id

    for review in response.xpath('//div[@itemprop="review"]'):
        user_review = ReviewItem()
        user_review['DBaseCategoryName'] = "USER"
        user_review['ProductName'] = product['ProductName']
        user_review['TestUrl'] = product['TestUrl']
        user_review['TestDateText'] = self.extract(
            review.xpath('./meta[@itemprop="datePublished"]/@content'))
        user_review['SourceTestRating'] = self.extract(
            review.xpath('.//span[@itemprop="ratingValue"]/text()'))
        user_review['Author'] = self.extract(
            review.xpath('.//span[@itemprop="author"]/text()'))
        user_review['TestTitle'] = self.extract(
            review.xpath('.//span[@itemprop="name"]/text()'))
        user_review['TestSummary'] = self.extract_all(
            review.xpath('.//span[@itemprop="description"]//text()'))
        yield user_review
def parse_reviews(self, response):
    """Yield one user review per ``caja-comentarios`` block.

    ``response.meta['product']`` must provide ``ProductName``, ``TestUrl``
    and ``source_internal_id``. A review may carry several "x/5" ratings:
    they are summed into ``SourceTestRating`` and the scale grows by 5 for
    each of them. Rating and scale are left unset when no rating is found.

    Yields:
        ReviewItem objects with ``DBaseCategoryName`` set to ``USER``.
    """
    product = response.meta['product']

    for review in response.xpath('//div[@class="caja-comentarios"]'):
        user_review = ReviewItem()
        user_review['DBaseCategoryName'] = "USER"
        user_review['ProductName'] = product['ProductName']
        user_review['TestUrl'] = product['TestUrl']
        user_review['source_internal_id'] = product['source_internal_id']

        date = self.extract(review.xpath('./p/text()[2]'))
        user_review['TestDateText'] = date_format(date, '%d-%m-%Y')

        rates = self.extract_list(
            review.xpath('.//li[@class="current-rating"]'))
        scale = 0
        rating = 0
        for rate in rates:
            rate_match = re.findall(r'([\d.]+)/5', rate)
            # Entries without an "x/5" value used to raise IndexError.
            if not rate_match:
                continue
            try:
                value = float(rate_match[0])
            except ValueError:
                # e.g. a lone "." matched by the pattern
                continue
            rating += value
            scale += 5
        # Without any usable rating the old code emitted "0" out of "0".
        if scale:
            user_review['SourceTestRating'] = str(rating)
            user_review['SourceTestScale'] = str(scale)

        user_review['Author'] = self.extract(
            review.xpath('.//span[contains(@class,"nick")]/text()'))
        user_review['TestSummary'] = self.extract(
            review.xpath('.//div[@class="caja"]/text()[1]'))
        user_review['TestPros'] = self.extract(review.xpath(
            './/strong[contains(text(),"Ventajas")]/following-sibling::text()[1]'))
        user_review['TestCons'] = self.extract(review.xpath(
            './/strong[contains(text(),"Desventajas")]/following-sibling::text()[1]'))
        yield user_review
def parse_reviews(self, response):
    """Yield the user reviews on this page, then a request for more.

    ``response.meta['product']`` must provide ``ProductName``; it is passed
    on to the follow-up request. The scale (5) is only set when a rating
    was found.

    Yields:
        ReviewItem objects, then optionally a Request for the "see more"
        link handled by this same callback.
    """
    product = response.meta["product"]
    next_page_xpath = "//div[@id='review-list']/div[@class='see-more-bar']//a/@href"

    for review in response.xpath('//ul[@id="reviews-list"]/li'):
        user_review = ReviewItem()
        user_review['DBaseCategoryName'] = "USER"
        user_review['ProductName'] = product['ProductName']
        user_review['TestUrl'] = self.extract(review.xpath('.//h3/a/@href'))

        date = self.extract(
            review.xpath('.//meta[@itemprop="datePublished"]/@content'))
        if date:
            # Only the leading YYYY-MM-DD part of the value is used.
            date = date[:10]
            user_review['TestDateText'] = date_format(date, "%Y-%m-%d")

        user_review['SourceTestRating'] = self.extract(
            review.xpath('.//span[@itemprop="reviewRating"]/@content'))
        if user_review['SourceTestRating']:
            user_review['SourceTestScale'] = 5

        user_review['Author'] = self.extract(
            review.xpath('.//a[@class="user-link"]//text()'))
        user_review['TestTitle'] = self.extract(review.xpath('.//h3//text()'))
        user_review['TestSummary'] = self.extract_all(
            review.xpath('.//div[@class="review-text"]//span/span/text()'))

        # The two XPaths were swapped: pros were read from the paragraph
        # after the "label-cons" label and cons from the one after
        # "label-pros". A lone "-" means "nothing entered" and is ignored.
        user_review['TestPros'] = self.extract_all(
            review.xpath(".//p[contains(@class,'label-pros')]/following::p[1][not(text()='-')]/text()"))
        user_review['TestCons'] = self.extract_all(
            review.xpath(".//p[contains(@class, 'label-cons')]/following::p[1][not(text()='-')]/text()"))
        yield user_review

    next_page_url = self.extract(response.xpath(next_page_xpath))
    if next_page_url:
        request = Request(url=next_page_url, callback=self.parse_reviews)
        request.meta['product'] = product
        yield request
def parse_pro(self, response):
    """Yield the professional review (if any), then request user reviews.

    ``response.meta['item']`` must provide ``name`` and ``url``. When the
    page has a ``besteproducttest`` block, the item is flagged with
    ``has_review = 1`` and a ``PRO`` ReviewItem is yielded. A Request for
    ``<item url>/gebruikersreviews`` handled by ``self.parse_user`` follows.

    Yields:
        Optionally a ReviewItem, then a Request carrying the item in meta.
    """
    item = response.meta['item']
    block = response.xpath('//div[@id="besteproducttest"]')
    score_xpath = ('.//div[@class="block"]'
                   '/div[contains(@class,"bp-review__intro__score")]//text()')

    if block:
        item['has_review'] = 1

        pro = ReviewItem()
        pro['DBaseCategoryName'] = "PRO"
        pro['ProductName'] = item['name']
        pro['TestUrl'] = response.url

        published = self.extract(block.xpath('.//@datetime'))
        pro['TestDateText'] = date_format(published, '')

        # The score uses a decimal comma; store it with a decimal point.
        score = self.extract(block.xpath(score_xpath))
        pro['SourceTestRating'] = score.replace(",", ".")

        pro['Author'] = self.extract(
            block.xpath('.//div[@class="avatar__title"]/text()'))
        pro['TestTitle'] = self.extract(block.xpath('.//h1/text()'))
        pro['TestSummary'] = self.extract_all(block.xpath('.//p/text()'))
        yield pro

    user_reviews = Request(url=item['url'] + '/gebruikersreviews',
                           callback=self.parse_user)
    user_reviews.meta['item'] = item
    yield user_reviews
def parse_product_review(self, response):
    """Yield review and product items for a review newer than the last run.

    Nothing is produced unless the date returned by ``self.get_date`` is
    later than ``self.stored_last_date``. When ``check_next_page`` is set in
    ``response.meta``, the URL in ``next_page_url`` is followed with
    ``self.parse`` as callback.

    Yields:
        A ReviewItem (``PRO``), a ProductItem, and optionally a Request for
        the next listing page.
    """
    # A single parenthesised argument prints the same text on Python 2 and
    # is also valid syntax on Python 3, unlike the old print statement.
    print(" ...PARSE_PRODUCT_REVIEW: " + response.url)

    date = self.get_date(response)
    if date > self.stored_last_date:
        # REVIEW ITEM ----------------------------------------------------
        review = ReviewItem()

        title_parts = response.xpath(
            '//div[@class="subheadtest"]/h4//text()').getall()
        review['TestTitle'] = " ".join(title_parts)

        review['ProductName'] = \
            self.get_product_name_based_on_title(review['TestTitle'])

        summary_parts = response.xpath(
            '//div[@class="csc-textpic-text"]/*//text()').getall()
        review['TestSummary'] = " ".join(summary_parts)

        review['TestDateText'] = date.strftime("%Y-%m-%d")
        review['DBaseCategoryName'] = 'PRO'

        # The id is the last "."-separated piece of the last path chunk,
        # taken from the part of the URL before ".0.html".
        sid = response.url.split('.0.html')[0]
        sid = sid.split('/')[-1]
        sid = sid.split('.')[-1]
        review['source_internal_id'] = sid

        review['TestUrl'] = response.url
        # ----------------------------------------------------------------

        # PRODUCT ITEM ---------------------------------------------------
        product = ProductItem()
        product['source_internal_id'] = review['source_internal_id']
        product['ProductName'] = review['ProductName']
        product['PicURL'] = response.meta.get('pic_url')
        product['TestUrl'] = response.url
        # ----------------------------------------------------------------

        yield review
        yield product

        # In case this is the last review of the page
        if response.meta.get('check_next_page'):
            yield response.follow(url=response.meta.get('next_page_url'),
                                  callback=self.parse)
def parse_review(self, response):
    """Yield USER reviews from one review listing page, then paginate.

    The product comes from ``response.meta['product']``.  Iteration stops
    (without following pagination) as soon as a review is dated before the
    latest stored user review for this product, or as soon as a review has
    neither pros nor cons.  If every review on the page is emitted, the
    ``rel='next'`` link is followed with the same meta.
    """
    pagination_query = "(//*[@rel='next']/@href)[1]"
    fallback_score_query = './/reevoo-score/@data-score'

    product = response.meta['product']
    articles = response.xpath('//article[contains(@id,"review_")]')
    if not articles:
        return

    # From observation, at least currys.co.uk uses a different format to
    # present review rating, so callers may pass their own xpath in meta.
    score_query = response.meta.get('rating_xpath', '') \
        or fallback_score_query

    cutoff = incremental_utils.get_latest_user_review_date_by_sii(
        self.mysql_manager,
        self.spider_conf['source_id'],
        product["source_internal_id"])

    for article in articles:
        item = ReviewItem()

        published = self.extract(article.xpath(
            './/span[contains(@class, "date_publish")]/text()'))
        if date_is_present(published):
            item['TestDateText'] = date_format(published, '')
            parsed = dateparser.parse(item['TestDateText'],
                                      date_formats=['%Y-%m-%d'])
            if parsed < cutoff:
                return

        item['DBaseCategoryName'] = "USER"
        item['ProductName'] = product['ProductName']
        item['TestUrl'] = product['TestUrl']
        item['SourceTestRating'] = self.extract(article.xpath(score_query))
        item['Author'] = self.extract(
            article.xpath('.//h4[@class="attribution-name"]/text()'))
        item['TestPros'] = self.extract_all(
            article.xpath('.//dd[@class="pros"]/text()'))
        item['TestCons'] = self.extract_all(
            article.xpath('.//dd[@class="cons"]/text()'))
        item['source_internal_id'] = product['source_internal_id']

        # All reviews after first empty review are empty
        if not (item['TestPros'] or item['TestCons']):
            return
        yield item

    next_href = self.extract(response.xpath(pagination_query))
    if next_href:
        yield Request(get_full_url(response, next_href),
                      callback=self.parse_review,
                      meta=response.meta)


def date_is_present(value):
    """Return the truthiness of an extracted date string."""
    return bool(value)
def test_review(self):
    """Check that ReviewItem stores every field it is given.

    Each field in the table is assigned on a fresh item, then the item's
    ``_name`` and every stored value are verified.
    """
    field_values = (
        ('source_id', 19827398),
        ('source_internal_id', "FakeID"),
        ('ProductName', "Fake Product Name"),
        ('SourceTestRating', "9 million"),
        ('SourceTestScale', "10"),
        ('TestDateText', "29/02/2000"),
        ('TestPros', "Shiny"),
        ('TestCons', "Doesn't work"),
        ('TestSummary', "BUY THEM ALL"),
        ('TestVerdict', "BUY"),
        ('Author', "Steve"),
        ('DbaseCategoryName', "Fake Items"),
        ('TestTitle', "An amazingly shiny thing I bought"),
        ('TestUrl',
         "http://awesomejunk.com/shinythings/fake_shiny_thing.html"),
        ('Pay', "Maybe"),
        ('award', "AWESOME"),
        ('AwardPic', "http://somewhere.else.com/pic.png"),
        ('countries', "ALL OF THEM"),
    )

    review = ReviewItem()
    for field, value in field_values:
        review[field] = value

    assert review._name == "review", "ReviewItem _name incorrect"
    for field, value in field_values:
        assert review[field] == value, \
            "ReviewItem %s incorrectly set" % field
def _parse_review(self, product, review_selector, extra_review_parser=None):
    """Build a USER ReviewItem from one Bazaarvoice-style review node.

    Args:
        product: mapping providing ``ProductName`` and ``TestUrl`` and,
            optionally, ``source_internal_id``.
        review_selector: selector scoped to a single review; every xpath
            below is evaluated relative to it.
        extra_review_parser: optional callable invoked as
            ``extra_review_parser(review_selector, review)``; its return
            value replaces the review.

    Returns:
        The value returned by ``extra_review_parser`` when one is given and
        it succeeds, otherwise the ReviewItem built here.  A failing extra
        parser is logged and the base review is returned instead.
    """
    import logging

    review = ReviewItem()

    date_xpath = './/meta[@itemprop="datePublished"]/@content'
    alt_date_xpath = './/*[contains(@class,"BVRRReviewDate")]/span[@class="value-title"]/@title'
    author_xpath = './/*[contains(@class,"BVRRNickname")]/text()|.//meta[@itemprop="author"]/@content'
    rating_xpath = './/*[contains(@class,"BVRRRatingOverall")]//*[contains(@class,"BVRRRatingNumber")]/text()'
    scale_xpath = './/*[contains(@class,"BVRRRatingOverall")]//*[contains(@class,"BVRRRatingRangeNumber")]//text()'
    pros_xpath = './/*[contains(@class,"BVRRReviewProTags") and contains(@class,"BVRRValue")]//text()'
    alt_pros_xpath = './/*[contains(@class,"BVRRTagsPrefix") and contains(text(),"Pro")]/following-sibling::*[contains(@class, "BVRRTags")][1]//text()'
    cons_xpath = './/*[contains(@class,"BVRRReviewConTags") and contains(@class,"BVRRValue")]//text()'
    alt_cons_xpath = './/*[contains(@class,"BVRRTagsPrefix") and contains(text(),"Cons")]/following-sibling::*[contains(@class, "BVRRTags")][1]//text()'
    summary_xpath = './/*[contains(@class,"BVRRReviewText")]//text()'
    title_xpath = './/*[contains(@class,"BVRRReviewTitle")]/text()'

    review['DBaseCategoryName'] = 'USER'
    if 'source_internal_id' in product:
        review['source_internal_id'] = product['source_internal_id']
    review['ProductName'] = product['ProductName']
    review['TestUrl'] = product['TestUrl']

    # Each "alt" xpath is a fallback used only when the primary one
    # extracted nothing.
    review['TestDateText'] = self.extract(
        review_selector.xpath(date_xpath))
    if not review['TestDateText']:
        review['TestDateText'] = self.extract(
            review_selector.xpath(alt_date_xpath))

    review['Author'] = self.extract(review_selector.xpath(author_xpath))

    review['SourceTestRating'] = self.extract(
        review_selector.xpath(rating_xpath))
    # The scale is only recorded alongside an actual rating.
    if review['SourceTestRating']:
        review['SourceTestScale'] = self.extract(
            review_selector.xpath(scale_xpath))

    review['TestPros'] = self.extract_all(
        review_selector.xpath(pros_xpath))
    if not review['TestPros']:
        review['TestPros'] = self.extract_all(
            review_selector.xpath(alt_pros_xpath))

    review['TestCons'] = self.extract_all(
        review_selector.xpath(cons_xpath))
    if not review['TestCons']:
        review['TestCons'] = self.extract_all(
            review_selector.xpath(alt_cons_xpath))

    review['TestSummary'] = self.extract_all(
        review_selector.xpath(summary_xpath))
    review['TestTitle'] = self.extract_all(
        review_selector.xpath(title_xpath))

    if extra_review_parser:
        try:
            return extra_review_parser(review_selector, review)
        except Exception:
            # Best effort: keep the base review, but record the failure.
            # Catching Exception (not a bare except) lets
            # KeyboardInterrupt and SystemExit propagate.
            logging.getLogger(__name__).exception(
                "extra_review_parser failed; returning the base review")
    return review