def parse_review(self, response):
    """Build a PRO review item for the page held in ``response``.

    The item is seeded from ``Review`` JSON-LD when present, otherwise from
    ``NewsArticle`` JSON-LD, otherwise it starts as an empty ``ReviewItem``.
    Product name, internal id, pros and cons are then read from the page.

    Returns the populated review item.
    """
    review_ld = extruct_helper.extract_json_ld(response.text, "Review")
    article_ld = extruct_helper.extract_json_ld(response.text, "NewsArticle")

    if review_ld:
        item = extruct_helper.review_item_from_review_json_ld(review_ld)
    elif article_ld:
        item = extruct_helper.review_item_from_article_json_ld(article_ld)
    else:
        item = ReviewItem()

    item['DBaseCategoryName'] = 'PRO'
    if not item.get('TestUrl', ''):
        item['TestUrl'] = response.url

    # The product data block on the page replaces any name set so far;
    # get_product_name() is only consulted when that block yields nothing.
    name = self.extract(
        response.xpath(
            "//div[@class='productDataBlock']/ul/li[1]/strong/text()"))
    item['ProductName'] = name
    if not item.get('ProductName', ''):
        item['ProductName'] = self.get_product_name(response)

    # Fifth "/"-separated piece of str(response), minus any trailing ">".
    id_piece = str(response).split("/")[4]
    item['source_internal_id'] = id_piece.rstrip('>')

    pros_nodes = response.xpath("//div[@id='ahReviewPros']/ul/li/text()")
    item['TestPros'] = self.extract(pros_nodes)
    cons_nodes = response.xpath("//div[@id='ahReviewCons']/ul/li/text()")
    item['TestCons'] = self.extract(cons_nodes)
    return item
def parse_product(self, response):
    """Yield category, product id, product and review request for a page.

    The category comes from ``BreadcrumbList`` JSON-LD (category names on
    category pages are not detailed enough). The product comes from
    ``Product`` JSON-LD, or from XPath when no JSON-LD is found. A request
    for the review page is yielded when a review id is found in the URL.
    """
    category = ''
    breadcrumbs = extruct_helper.extract_json_ld(response.body, 'BreadcrumbList')
    if breadcrumbs:
        category = extruct_helper.category_item_from_breadcrumbs_json_ld(breadcrumbs)
        yield category
        if self.should_skip_category(category):
            return

    # TODO: retry if we fail to get JSON-LD?
    sku = ''
    product_ld = extruct_helper.extract_json_ld(response.body, 'Product')
    if not product_ld:
        # Not sure why we fail to extract JSON-LD from some pages, it will
        # be good if we can figure out later
        fallback_xpaths = {
            "PicURL": "(//*[@property='og:image'])[1]/@content",
            "ProductName": "//h1[contains(@class, 'page-title')]/span//text()",
            "ProductManufacturer": "//h1[contains(@class,'page-title')]/span[1]/text()",
        }
        product = self.init_item_by_xpaths(response, "product", fallback_xpaths)
    else:
        product = extruct_helper.product_item_from_product_json_ld(product_ld)
        sku = product_ld.get('sku', None)

    if not sku:
        sku = self.extract(response.xpath("//p[@class='prd-code']/text()"))
        if sku:
            # Keep only the part after the last ': ' in the extracted text.
            pieces = sku.split(': ')
            if pieces:
                sku = pieces[-1]

    product['TestUrl'] = response.url
    if category:
        product['OriginalCategoryName'] = category['category_path']

    if sku:
        product['source_internal_id'] = sku
        yield self.product_id(product=product, kind='currys_internal_id', value=sku)

    yield product

    id_match = re.search(self.reevoo_review_id_re, response.url)
    reevoo_review_id = id_match.group(1) if id_match else ''
    if not reevoo_review_id:
        return

    # TODO: test if the url is valid or not?
    review_request = Request(url=self.review_url_format.format(reevoo_review_id),
                             callback=self.parse_review)
    review_request.meta['product'] = product
    review_request.meta['rating_xpath'] = ".//div[@class='overall_score_stars']/@title"
    yield review_request
def parse_review(self, response):
    """Collect review fields from XPath and ``Review`` JSON-LD data.

    The review is first initialised from XPath expressions, then enriched
    from ``Review`` JSON-LD when present. If no rating was found, a second
    rating element on a 100-point scale is read and stored on a 5-point
    scale in ``SourceTestRating``.
    """
    # NOTE(review): not used in the visible part of this method - confirm
    # whether the breadcrumb data is still needed.
    category_json_ld = extruct_helper.extract_json_ld(
        response.body, 'BreadcrumbList')

    review_xpaths = {
        "SourceTestRating": "(//span[contains(@class,'rating')]/@title)[1]",
        "TestPros": "(//ul[contains(@class,'plusmin-list')])[1]"
                    "//li[contains(@class,'plusmin-item')]//text()",
        "TestCons": "(//ul[contains(@class,'plusmin-list')])[2]"
                    "//li[contains(@class,'plusmin-item')]//text()"
    }
    review = self.init_item_by_xpaths(response, "review", review_xpaths)

    review_json_ld = extruct_helper.extract_json_ld(response.body, 'Review')
    if review_json_ld:
        review = extruct_helper.review_item_from_article_json_ld(
            review_json_ld, review)

    # different scale based rating system
    if not review.get('SourceTestRating', ''):
        rating_xpath = "//span[contains(@class, 'starrating')]/span[contains(@class, 'value-title')]/text()"
        rating_str = self.extract(response.xpath(rating_xpath))
        # new rating scale at 100
        rating_ratio = 100 / 5
        try:
            if rating_str:
                rating_unified_str = (float(rating_str)) / rating_ratio
                review["SourceTestRating"] = rating_unified_str
        except ValueError as e:
            # "except X as e" is valid on Python 2.6+ and Python 3, and the
            # message is formatted before printing so it works on both.
            print(e)
            print('rating_str is: {}'.format(rating_str))
    # NOTE(review): the method as visible never returns or yields `review`;
    # confirm whether that is intended.
def parse_product(self, response):
    """Yield the product on this page and a request for its reviews.

    The internal id is taken from the URL via ``self.product_url_re``; the
    method logs an error and stops when it cannot be found. When no
    ``Product`` JSON-LD is present, the result of ``self._retry`` for the
    original request is yielded instead.
    """
    category = response.meta['category']

    id_match = re.search(self.product_url_re, response.url)
    if not id_match:
        self.logger.error('Failed to get source internal id for product at: {}'.format(response.url))
        return
    source_internal_id = id_match.group(1)

    product_ld = extruct_helper.extract_json_ld(response.text, 'Product')
    if not product_ld:
        yield self._retry(response.request)
        return

    product = extruct_helper.product_item_from_product_json_ld(product_ld)
    product['TestUrl'] = response.url
    product['source_internal_id'] = source_internal_id
    product['OriginalCategoryName'] = category['category_path']
    yield product

    reviews_href = self.extract_xpath(
        response,
        "//div[@class='product-page--title-links']//a[@class='review-rating--reviews-link']/@href")
    if not reviews_href:
        return

    # The query string is appended to the absolute reviews URL.
    reviews_url = get_full_url(response, reviews_href) + '?sorteer=date%20desc'
    reviews_request = Request(reviews_url, callback=self.parse_reviews)
    reviews_request.meta['product'] = product
    yield reviews_request
def parse_product(self, response):
    """Build and return the product item for the page in ``response``.

    Picture, name and manufacturer come from XPath; the name falls back to
    ``self.get_product_name``. The category name is the ``name`` of the
    second breadcrumb entry in the ``BreadcrumbList`` JSON-LD, when present.
    """
    xpaths = {
        'PicURL': '//a[@class="lb-show"]/@href',
        'ProductName':
            '//div[@class="productDataBlock"]/ul/li[1]/strong/text()',
        'ProductManufacturer':
            '//div[@class="productDataBlock item"]/'
            'ul/li[contains(text(), "Manufacturer")]/strong/span/text()',
    }
    item = self.init_item_by_xpaths(response, "product", xpaths)
    if not item.get('ProductName', ''):
        item['ProductName'] = self.get_product_name(response)

    # Fifth "/"-separated piece of str(response), minus any trailing ">".
    id_piece = str(response).split("/")[4]
    item['source_internal_id'] = id_piece.rstrip('>')

    breadcrumbs = extruct_helper.extract_json_ld(response.text, "BreadcrumbList")
    if breadcrumbs:
        entries = breadcrumbs.get('itemListElement', None)
        if entries and len(entries) > 1:
            second_entry = entries[1].get('item', {})
            item['OriginalCategoryName'] = second_entry.get('name', '')

    return item
def parse_product(self, response):
    """Yield product, its internal-id item and a review request.

    Name and picture come from ``Product`` JSON-LD; nothing is yielded when
    that data is missing. The internal id is the last path segment of the
    page URL.
    """
    category = response.meta['category']

    item = ProductItem()
    item['TestUrl'] = response.url
    item['OriginalCategoryName'] = category['category_path']
    item['ProductName'] = ''
    item['PicURL'] = ''

    product_ld = extruct_helper.extract_json_ld(response.text, 'Product')
    if not product_ld:
        # TODO: add fallback plan?
        return
    item['ProductName'] = product_ld.get('name', '')
    item['PicURL'] = product_ld.get('image', '')

    path_parts = urlparse(response.url).path.split('/')
    if path_parts:
        item["source_internal_id"] = path_parts[-1]
    yield item

    yield self.product_id(item, kind='reevoo_internal_id',
                          value=item['source_internal_id'])

    # TODO: test if the url is valid or not?
    review_request = Request(
        self.review_url_format.format(item["source_internal_id"]),
        callback=self.parse_review)
    review_request.meta['product'] = item
    yield review_request
def parse_category(self, response):
    """Yield the page category, product requests and the next-page request.

    Only products showing a review score element are followed. Nothing is
    yielded when the page lists no products, and processing stops after the
    category item when ``should_skip_category`` says so.
    """
    listing = response.xpath(
        "//div[@data-component='product-list-view']/article/div[@class='desc']")
    if not listing:
        return

    # This category may be too general, but it helps if we know it can be skipped
    breadcrumbs = extruct_helper.extract_json_ld(response.body, 'BreadcrumbList')
    if breadcrumbs:
        category = extruct_helper.category_item_from_breadcrumbs_json_ld(breadcrumbs)
        yield category
        if self.should_skip_category(category):
            return

    for entry in listing:
        if not entry.xpath(".//*[contains(@class, 'reevoo-score')]"):
            continue
        href = self.extract(entry.xpath("./a/@href"))
        yield Request(url=get_full_url(response, href), callback=self.parse_product)

    next_href = self.extract(response.xpath("//a[@class='next']/@href"))
    if next_href:
        yield Request(get_full_url(response, next_href), callback=self.parse_category)
def parse_reviews(self, response):
    """Complete the product and review passed in ``response.meta``.

    Verdict, pros and cons are read from the page, then the review is
    enriched from ``Review`` JSON-LD when present. Yields the product
    followed by the review.
    """
    meta = response.meta
    product = meta['product']
    review = meta['review']

    product['TestUrl'] = response.url

    # Look under <h4> headings first, then fall back to <h3> headings.
    verdict = self.extract_all(
        response.xpath(
            '//h4[contains(text(),"Wrap") or contains(text(),"Conclusion")]/following-sibling::p//text()'),
        separator=" ")
    review['TestVerdict'] = verdict
    if not review['TestVerdict']:
        review['TestVerdict'] = self.extract_all(
            response.xpath(
                '//h3[contains(text(),"Wrap") or contains(text(),"Conclusion")]/following-sibling::p//text()'),
            separator=" ")

    review['DBaseCategoryName'] = "PRO"
    review['TestUrl'] = response.url

    pros_nodes = response.xpath("//div[contains(@class, 'review-pros')]//li/text()")
    review['TestPros'] = self.extract_all(pros_nodes, separator=' ; ')
    cons_nodes = response.xpath("//div[contains(@class, 'review-cons')]//li/text()")
    review['TestCons'] = self.extract_all(cons_nodes, separator=' ; ')

    review_ld = extruct_helper.extract_json_ld(response.text, 'Review')
    if review_ld:
        review = extruct_helper.review_item_from_review_json_ld(review_ld, review)

    yield product
    yield review
def parse_review(self, response):
    """Build and return a PRO review item for the page in ``response``.

    Summary, pros, cons, internal id and product name are read via XPath.
    The remaining fields come from ``Review`` JSON-LD when present.
    XPath fields that match nothing are stored as empty strings instead of
    raising ``IndexError``.

    Returns the populated ``ReviewItem``.
    """
    review = ReviewItem()

    # Parsing using XPath
    xpaths = {
        'TestSummary': '//meta[@property="og:description"]/@content',
        'TestPros': '//*[@class="rs-review--positives"]//span/text()',
        'TestCons': '//*[@class="rs-review--negatives"]//span/text()',
        'source_internal_id': '//div[@data-widget="article-edit"]/@data-meta',
        'ProductName': '//section/header/h1/text()',
    }

    # Extract
    data = {}
    for key in xpaths:
        data[key] = response.xpath(xpaths[key]).extract()

    # Process
    raw_meta = data['source_internal_id']
    if raw_meta:
        # The data-meta attribute holds a JSON object; its "id" is the id.
        data['source_internal_id'] = json.loads(raw_meta[0]).get('id')
    else:
        # Store an empty string rather than the empty match list.
        data['source_internal_id'] = ''
    data['TestPros'] = ';'.join(data['TestPros'])
    data['TestCons'] = ';'.join(data['TestCons'])
    data['TestSummary'] = data['TestSummary'][0] if data['TestSummary'] else ''
    data['ProductName'] = data['ProductName'][0] if data['ProductName'] else ''

    for key in xpaths:
        review[key] = data[key]

    # Parsing using JSON-LD
    # Populates:
    # Author, SourceTestRating, SourceTestScale, TestDateText, TestTitle
    review_json_ld = extruct_helper.extract_json_ld(response.text, 'Review')
    if review_json_ld:
        review = extruct_helper.review_item_from_review_json_ld(
            review_json_ld, review)

    review['TestUrl'] = response.url
    review['source_id'] = self.spider_conf["source_id"]
    review['DBaseCategoryName'] = 'PRO'

    # There are some occurrences of "null" in the TestTitle and TestSummary.
    # .get() is used because TestTitle is only set by the JSON-LD step.
    if review.get('TestTitle') == 'null':
        review['TestTitle'] = review['ProductName']
    if review.get('TestSummary') == 'null':
        review['TestSummary'] = ''

    return review
def test_review_item_from_review_json_ld_default_best_rating(self):
    """A Review without ``bestRating`` gets a ``SourceTestScale`` of 5."""
    # Adjacent literals concatenate to the same single-line markup.
    markup = (
        '<script type="application/ld+json"> { '
        '"@context":"http://schema.org/", '
        '"@type":"Review", '
        '"itemReviewed":{"@type":"Product","name":"OnePlus 5"}, '
        '"reviewRating":{"@type":"Rating","ratingValue":5} '
        '} </script>'
    )
    review_ld = extruct_helper.extract_json_ld(markup, 'Review')
    item = extruct_helper.review_item_from_review_json_ld(review_ld)

    self.assertIsNotNone(item)
    scale = int(item['SourceTestScale'])
    self.assertEqual(scale, 5)
def parse_review(self, response):
    """Yield category, product and (when available) review for a page.

    The category comes from ``BreadcrumbList`` JSON-LD; the method stops
    when it is missing or when ``should_skip_category`` says so. The product
    name is derived from the ``og:url`` meta tag. The review is only yielded
    when ``Article`` JSON-LD is present.
    """
    # Format before printing: print(...).format(...) only works as a
    # Python 2 print statement.
    print('2. got to the parse_review page with {}'.format(response.url))
    review = ReviewItem()

    category_json_ld = extruct_helper.extract_json_ld(
        response.body, 'BreadcrumbList')
    if not category_json_ld:
        print('no category can be found')
        return

    category = extruct_helper.category_item_from_breadcrumbs_json_ld(
        category_json_ld)
    if self.should_skip_category(category):
        return
    yield category

    product_xpaths = {
        "PicURL": "//meta[@property='og:image']/@content",
    }
    product = self.init_item_by_xpaths(response, "product", product_xpaths)

    if category:
        CategoryName = category['category_path']
        product["OriginalCategoryName"] = CategoryName.replace(
            ' | Testbericht', '')

    source_internal_id_re = r'Review_([0-9]+)'
    source_internal_id_xpath = "//article[contains(@id,'Review')]/@id"
    product["source_internal_id"] = response.xpath(
        source_internal_id_xpath).re_first(source_internal_id_re)

    product_name_re = r'.de/(.*),testberichte'
    product_name_xpath = "//meta[@property='og:url']/@content"
    product_name = response.xpath(product_name_xpath).re_first(
        product_name_re)
    # re_first() gives None when nothing matches; avoid calling .replace
    # on None and store an empty name instead.
    product["ProductName"] = product_name.replace('-', ' ') if product_name else ''
    yield product

    review_json_ld = extruct_helper.extract_json_ld(response.body, 'Article')
    if review_json_ld:
        review = extruct_helper.review_item_from_article_json_ld(
            review_json_ld, review)
        review["ProductName"] = product["ProductName"]
        review["DBaseCategoryName"] = "PRO"
        # NOTE(review): TestUrl is not set on the product in this method;
        # presumably init_item_by_xpaths fills it in - confirm.
        review["TestUrl"] = product["TestUrl"]
        review["source_internal_id"] = product["source_internal_id"]
        yield review
def parse_product_json(self, response):
    """Yield category, product, price and SKU items from Product JSON-LD.

    Nothing is yielded when the page has no ``Product`` JSON-LD. The
    category path is the JSON-LD ``category`` with ``/`` separators replaced
    by `` | `` (empty when the field is missing). Product, price and SKU
    items are only yielded when the category is not skipped.
    """
    product_json_ld = extruct_helper.extract_json_ld(response.body, 'Product')
    if not product_json_ld:
        return

    raw_category = product_json_ld.get('category', '')
    ocn = ' | '.join(raw_category.split('/')) if raw_category else ''

    category = CategoryItem()
    category['category_path'] = ocn
    yield category

    if self.should_skip_category(category):
        return

    product = extruct_helper.product_item_from_product_json_ld(
        product_json_ld)
    product['source_id'] = self.spider_conf['source_id']
    product['TestUrl'] = response.url
    product['source_internal_id'] = product_json_ld.get('productID', '')
    product['OriginalCategoryName'] = ocn
    yield product

    # Product Price Item
    # ----------------------------------------
    offers = product_json_ld.get('offers') or {}
    if isinstance(offers, (list, tuple)):
        # JSON-LD may carry several offers; use the first one.
        offers = offers[0] if offers else {}
    price = offers.get('price', '')
    if price is None:
        price = ''
    currency = offers.get('priceCurrency', '')
    # format() accepts numeric prices, which '+' concatenation does not.
    price_str = u'{} {}'.format(price, currency)
    yield ProductIdItem.from_product(product, kind='price', value=price_str)

    # Product SKU Item
    # ----------------------------------------
    sku_str = product_json_ld.get('sku', '')
    yield ProductIdItem.from_product(product, kind='SKU', value=sku_str)
def test_review_item_from_review_json_ld_full_review(self):
    """Every supported field of a complete Review JSON-LD is mapped."""
    html_text = '''<script type="application/ld+json"> { "@context": "http://schema.org/", "@type": "Review", "itemReviewed": { "@type": "Product", "name": "OnePlus 5" }, "author": { "@type": "Person", "name": "Joe" }, "reviewRating": { "@type": "Rating", "ratingValue": "7", "bestRating": "10" }, "publisher": { "@type": "Organization", "name": "CNET" }, "datePublished":"2017-08-07", "headline":"OnePlus 5 review", "description":"The OnePlus 5 is one of the best phones you can buy today" } </script>'''
    json_ld = extruct_helper.extract_json_ld(html_text, 'Review')
    review = extruct_helper.review_item_from_review_json_ld(json_ld)

    self.assertIsNotNone(review)
    self.assertEqual(review['ProductName'], 'OnePlus 5')
    self.assertEqual(review['Author'], 'Joe')
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(review['TestDateText'], '2017-08-07')
    self.assertEqual(int(review['SourceTestRating']), 7)
    self.assertEqual(int(review['SourceTestScale']), 10)
    self.assertEqual(review['TestTitle'], 'OnePlus 5 review')
    self.assertEqual(
        review['TestSummary'],
        'The OnePlus 5 is one of the best phones you can buy today')
def parse_review(self, response):
    """Yield category, product and PRO review for a review page.

    Fields are first read via XPath, then enriched from ``Review`` JSON-LD.
    Title, product name, date, author, picture and rating each have a
    fallback that is used when the earlier sources left them empty.
    """
    # TODO verdict not found and source_id not found
    product_xpath = {"PicURL": "//*[@property='og:image']/@content"}
    review_xpaths = {
        "TestSummary": "//*[@property='og:description']/@content",
        "TestPros": "//div[@id='wired-tired']//p[1]/text()",
        "TestCons": "//div[@id='wired-tired']//p[2]/text()",
        "TestDateText": "(//meta[@itemprop='datePublished'])[1]/@content",
    }
    product = self.init_item_by_xpaths(response, "product", product_xpath)
    review = self.init_item_by_xpaths(response, "review", review_xpaths)

    # utilize structured data
    # --------------------------------------------------
    # get review from structured data 'Review'
    review_json_ld = extruct_helper.extract_json_ld(response.text, 'Review')
    if review_json_ld:
        review = extruct_helper.review_item_from_review_json_ld(
            review_json_ld, review)

    # get title from structure data 'NewsArticle', or Product
    # -------------------------------------------------------
    # wired.com use the format
    # 'Review: [product name] | wired' as title most of the time
    title = ''
    news_article_json_ld = extruct_helper.extract_json_ld(
        response.text, 'NewsArticle')
    product_json_ld = extruct_helper.extract_json_ld(
        response.text, 'Product')
    if news_article_json_ld:
        # "or ''" keeps a missing headline from crashing on None.strip()
        title = (news_article_json_ld.get('headline') or '').strip()
        review['TestTitle'] = title
    elif product_json_ld:
        title = (product_json_ld.get('name') or '').strip()
        review['TestTitle'] = title

    # double check product name
    # --------------------------------------------------
    product_name = review.get('ProductName')
    if not product_name:
        if title.startswith('Review:'):
            PRODUCT_INDEX = 1
            product_name = title.split(':')[PRODUCT_INDEX].strip()
        else:
            product_name = title.split('Review')[0].strip()
        # get rid of the last part of 'product_name | wired'
        if '|' in product_name:
            product_name = product_name.split('|')[0].strip()
    review['ProductName'] = product_name
    product['ProductName'] = product_name

    # double check date
    # --------------------------------------------------
    date = review['TestDateText']
    if not date:
        date_xpath = "//meta[@name='parsely-pub-date']/@content"
        date = self.extract(response.xpath(date_xpath))
    review['TestDateText'] = date_format(date, '')

    # double check author
    # --------------------------------------------------
    author = review.get('Author', '')
    if not author:
        author_xpath = "//span[@itemprop='author']/a/text()"
        author = self.extract(response.xpath(author_xpath))
        if author:
            review['Author'] = author

    # parse category using tags
    category = self.get_categories_from_tags(response)
    if category:
        yield category
        if self.should_skip_category(category):
            return
        product['OriginalCategoryName'] = category['category_path']

    # double check PicURL for product
    # --------------------------------------------------
    pic_url = product.get('PicURL')
    if not pic_url:
        pic_url_xpath = "(//div[contains(@class, 'gallery-pic')]//img)[1]/@src"
        pic_url = self.extract_xpath(response, pic_url_xpath)
        if pic_url:
            product['PicURL'] = pic_url
    yield product

    # double check review rating
    # --------------------------------------------------
    rating_value = review.get('SourceTestRating')
    if not rating_value:
        rating_text_xpath = "//h3[contains(text(), 'RATING')]/following-sibling::p//text()"
        rating_text = self.extract_xpath(response, rating_text_xpath)
        rating_re = r'([0-9]+)'
        if rating_text:
            rating_match = re.search(rating_re, rating_text)
            if rating_match:
                rating = rating_match.group(0)
                review['SourceTestRating'] = rating
                # u'' literal instead of unicode(): same value on Python 2,
                # and also valid on Python 3.
                REVIEW_SCALE = u'10'
                review['SourceTestScale'] = REVIEW_SCALE

    review["DBaseCategoryName"] = "PRO"
    yield review
def parse_review(self, response):
    """Yield category, product id, product and PRO review for a page.

    The product is taken from ``response.meta['product']``. The review is
    initialised via XPath and enriched from ``Review`` JSON-LD, with two
    alternative verdict locations tried in turn when the verdict is empty.
    """
    xpaths = {
        "TestTitle": "//*[@property='og:title']/@content",
        "TestSummary": "//*[@property='og:description']/@content",
        "TestVerdict": "//section[@class='review-body']//*[contains(text(),'Conclusion')]/ancestor::p//text()",
        "TestPros":"//div[@class='pros-cons-bl']//*[contains(text(),'Pros')]//parent::li//p[@class='summary']//text()",
        "TestCons":"//div[@class='pros-cons-bl']//*[contains(text(),'Cons')]//parent::li//p[@class='summary']//text()",
    }

    product = response.meta['product']
    review = self.init_item_by_xpaths(response, "review", xpaths)

    # get category
    category = CategoryItem()
    breadcrumbs = extruct_helper.extract_json_ld(response.text, 'BreadcrumbList')
    if breadcrumbs:
        category = extruct_helper.leaf_category_item_from_breadcrumbs_json_ld(breadcrumbs, category)
        yield category
        if self.should_skip_category(category):
            return
        product["OriginalCategoryName"] = category['category_path']

    product['TestUrl'] = response.url
    review['TestUrl'] = product['TestUrl']

    review_ld = extruct_helper.extract_json_ld(response.text, 'Review')
    if not review_ld:
        # No structured review: take the name from the page heading.
        heading = self.extract(response.xpath("//h1[contains(@class,'item')]/text()"))
        product['ProductName'] = heading
        review['ProductName'] = product['ProductName']
    else:
        review = extruct_helper.review_item_from_review_json_ld(review_ld, _review=review)
        product['ProductName'] = review['ProductName']

    if review.get("TestDateText", ''):
        review["TestDateText"] = date_format(review["TestDateText"], "%Y-%m-%dT%H:%M:%S")

    # Try each alternative verdict location only while the verdict is empty.
    fallback_verdict_xpaths = (
        "string(//section[@class='review-body']//*[contains(text(),'Conclusion')]/following::p[ string-length(.//text()) > 0 ][1])",
        "string(//div[contains(@class, 'article-footer')]/preceding::p[ string-length(.//text()) > 0 ][1])",
    )
    for verdict_xpath in fallback_verdict_xpaths:
        if not review['TestVerdict']:
            review['TestVerdict'] = self.extract_all(response.xpath(verdict_xpath))

    article_id = self.extract(response.xpath("//meta[@name='article-id']/@content"))
    if article_id:
        product['source_internal_id'] = article_id
        review['source_internal_id'] = article_id
        yield self.product_id(product, kind='pcmag_internal_id', value=article_id)

    award_src = self.extract(response.xpath("//div[@class='editors-logo']/img/@src"))
    if award_src:
        review['AwardPic'] = get_full_url(response, award_src)
        review['award'] = "Editor's Choice"

    review["DBaseCategoryName"] = "PRO"
    review["SourceTestScale"] = "5"

    yield product
    yield review
def parse_review(self, response):
    """Yield category, product and PRO review for a review page.

    Items are initialised via XPath and the review is enriched from
    ``Review`` JSON-LD. The product name is the text before ``review`` in
    the JSON-LD item name, or in the first ``<h1>`` as a fallback. The
    internal id is the last ``/``-separated piece of the page URL.
    """
    product = self.init_item_by_xpaths(
        response, "product",
        {"PicURL": "//*[@property='og:image']/@content"})
    review = self.init_item_by_xpaths(
        response, "review",
        {
            "TestTitle": "//*[@property='og:title']/@content",
            "TestSummary": '//meta[@property="og:description"]/@content',
            "TestVerdict": '//a[@id="conclusion"]/following::p[1]/text()',
            "TestPros": '//div[@class="iconProText"]/text()',
            "TestCons": '//div[@class="iconConText"]/text()',
            'Author': '//div[@class="small"]/a/text()',
        })

    # utilize structured data
    name = ''
    review_ld = extruct_helper.extract_json_ld(response.text, 'Review')
    if review_ld:
        review = extruct_helper.review_item_from_review_json_ld(review_ld, review)
        reviewed_name = review_ld.get('itemReviewed', {}).get('name', '')
        name = reviewed_name.split('review')[0].strip()

    # incremental
    if not review.get('TestDateText', ''):
        # No structured date: parse the date text shown on the page.
        raw_date = self.extract(response.xpath('//div[@class="small"][2]/text()'))
        review['TestDateText'] = parse(raw_date).strftime("%Y-%m-%d")
    else:
        review['TestDateText'] = date_format(review['TestDateText'], '')

    if not name:
        heading = self.extract(response.xpath("//h1/text()"))
        if heading:
            name = heading.split('review')[0].strip()

    product['ProductName'] = name
    review['ProductName'] = product['ProductName']

    product['OriginalCategoryName'] = self.extract_all(
        response.xpath("//span[@itemprop='name']/text()"), separator=' | ')

    url_tail = response.url.split('/')[-1]
    product['source_internal_id'] = url_tail
    review['source_internal_id'] = url_tail

    if product.get('OriginalCategoryName', ''):
        category = CategoryItem()
        category['category_path'] = product['OriginalCategoryName']
        yield category

    yield product

    award_img = response.xpath("//td/div[2]/img[contains(@alt, 'Award')]")
    if award_img:
        award_label = self.extract_xpath(award_img, './@alt')
        award_src = self.extract_xpath(award_img, './@src')
        if award_label and award_src:
            review['award'] = 'TechGearLab ' + award_label
            review['AwardPic'] = award_src

    review["DBaseCategoryName"] = "PRO"
    yield review
def parse_review(self, response):
    """Yield the page category, then build product/review from microdata.

    The method currently prints the microdata-based product and review and
    returns; the XPath/JSON-LD based extraction below that point is kept
    but is not reached.
    """
    category_path_xpath = "(//div[@class='popular_groups']//li/a)[1]/text()"
    category = CategoryItem()
    category['category_path'] = self.extract(
        response.xpath(category_path_xpath))
    if self.should_skip_category(category):
        return
    yield category

    microdata_items = extruct_helper.get_microdata_extruct_items(
        response.text)
    if not microdata_items:
        return

    # A capture group is required for match.group(1) below; without it a
    # matching URL raised "no such group".
    source_internal_id_re = r'/review/([^/]+)/'
    source_internal_id = ''
    match = re.search(source_internal_id_re, response.url)
    if match:
        source_internal_id = match.group(1)

    product = ProductItem.from_response(
        response, category, source_internal_id=source_internal_id)
    review = list(
        extruct_helper.get_reviews_microdata_extruct(microdata_items,
                                                     product,
                                                     review_type='PRO'))
    if len(review) > 1:
        self.logger.error(
            'Found more than 1 reviews in {0} through microdata'.format(
                response.url))
        return
    if not review:
        # No review in the microdata: nothing to take the first element of.
        return
    review = review[0]

    # Function-call form prints the same text on Python 2 and 3.
    print(product)
    print(review)
    # NOTE(review): this unconditional return makes everything below
    # unreachable; it looks like leftover debugging - confirm before
    # removing either the return or the code after it.
    return

    product_xpaths = {
        "ProductName": "//h1[@itemprop='headline']/text()",
        "PicURL": "//meta[@property='og:image']/@content",
    }
    product = self.init_item_by_xpaths(response, "product", product_xpaths)
    product['TestUrl'] = response.url
    picurl = product.get("PicURL", "")
    if picurl and picurl[:2] == "//":
        product["PicURL"] = "https:" + product["PicURL"]
    if picurl and picurl[:1] == "/":
        product["PicURL"] = get_full_url(response.url, picurl)
    product['OriginalCategoryName'] = category['category_path']

    review_xpaths = {
        "TestTitle": "//*[@property='og:title']/@content",
        "TestPros": "//div[div[text()='The good'] or span[text()='The good']]//li/text()",
        "TestCons": "//div[div[text()='The bad'] or span[text()='The bad']]//li/text()",
        "TestSummary": "//h3[.//text() = 'Bottom Line' or .//text() = 'Bottom line' or .//text = 'Verdict']/"
                       "following-sibling::*[ .//text()[normalize-space()] ][1]//text()",
        "TestVerdict": "//div[div[text()='Verdict'] or span[text()='Verdict']]//p/text()",
    }
    review = self.init_item_by_xpaths(response, "review", review_xpaths)
    review['TestUrl'] = response.url

    review_json_ld = extruct_helper.extract_json_ld(
        response.text, 'Review')
    if review_json_ld:
        review = extruct_helper.review_item_from_review_json_ld(
            review_json_ld, review)

    if not review.get('TestDateText'):
        review['TestDateText'] = self.extract(
            response.xpath("//meta[@itemprop='datePublished']/@content"))
    if review["TestDateText"]:
        review["TestDateText"] = review["TestDateText"].strip()
        review["TestDateText"] = date_format(review["TestDateText"],
                                             "%Y-%m-%d")

    if review.get('ProductName', ''):
        product['ProductName'] = review['ProductName']
    else:
        title = review["TestTitle"].lower()
        if ":" in title:
            all_title_parts = title.split(":")
            for part in all_title_parts:
                review["ProductName"] = part.replace(
                    "review", "") if 'review' in part else title.replace(
                        "review", "")
        else:
            review["ProductName"] = title.replace("review", "")
        review["ProductName"] = review["ProductName"].strip("-: ")
        product["ProductName"] = review["ProductName"]

    internal_id_re = r',review-(.*)\.html'
    match = re.search(internal_id_re, response.url)
    if match:
        internal_id = match.group(1)
        product['source_internal_id'] = internal_id
        review['source_internal_id'] = internal_id
        product_id = self.product_id(product,
                                     kind='tomsguide_en_internal_id',
                                     value=internal_id)
        yield product_id

    alt_summary_xpath = "//div[@class='sbbl-content-text']/p//text()"
    if not review['TestSummary']:
        review['TestSummary'] = self.extract(
            response.xpath(alt_summary_xpath))

    alt_verdict_xpath = "//div[div[text()='Verdict'] or span[text()='Verdict']]//div/text()"
    if not review['TestVerdict']:
        review['TestVerdict'] = self.extract(
            response.xpath(alt_verdict_xpath))

    # only get summary from article description if both verdict and summary are empty,
    # or else summary and verdict may end up to be the same
    if not review['TestSummary'] and not review['TestVerdict']:
        review['TestSummary'] = self.extract(
            response.xpath("//meta[@name='description']/@content"))

    review["DBaseCategoryName"] = "PRO"

    ec_award_xpath = "//section[contains(@class, 'page-content-leftcol')]//div[@class='editor-pick']"
    ec_award = response.xpath(ec_award_xpath)
    if ec_award:
        review['award'] = "Editor's Choice"
        review[
            'AwardPic'] = "http://qa901.office.alatest.se/omt-award-images/tomsguide_en_editor_pick.png"

    yield product
    yield review
def parse_category(self, response):
    """Yield the category, per-product id items and review requests.

    The category comes from ``response.meta`` when present, otherwise from
    ``BreadcrumbList`` JSON-LD (the request is retried via ``self._retry``
    when that is missing). For each listed product that shows a rating, a
    product-id item and a review request built by ``self.get_review_url``
    are yielded. A request for the next listing page is yielded last.
    """
    products_xpath = "//div[@class='Productlist']/div"
    product_sku_xpath = "./@data-sku"
    product_url_xpath = "./a/@href"
    has_review_xpath = ".//span[@class='Rating-average']"
    next_page_xpath = "(//*[@rel='next'])[1]/@href"

    category = response.meta.get('category', '')
    if not category:
        # the category we get here is actually parent category
        category_json_ld = extruct_helper.extract_json_ld(
            response.text, 'BreadcrumbList')
        if not category_json_ld:
            request = self._retry(response.request)
            yield request
            return

        category = extruct_helper.category_item_from_breadcrumbs_json_ld(
            category_json_ld)
        current_category_name = self.extract(
            response.xpath(
                "//div[@id='breadcrumb']/ul/li[@class='pad-left']/text()"))
        if current_category_name.lower() != category['category_leaf'].lower():
            # Extend the parent path with the category shown on this page.
            category['category_leaf'] = current_category_name
            category['category_path'] = u'{} | {}'.format(
                category['category_path'], current_category_name)
            category['category_url'] = response.url
        yield category

    if self.should_skip_category(category):
        return

    products = response.xpath(products_xpath)
    # Not a leaf category page
    if not products:
        return

    # We skip the product page, as feelunique.com tries to block us if we access too many of their pages,
    # but it is impossible for them to block the access to Bazaarvoice API
    for product in products:
        has_review = product.xpath(has_review_xpath)
        if not has_review:
            continue

        product_sku = self.extract(product.xpath(product_sku_xpath))
        if product_sku:
            product_id = ProductIdItem()
            product_id['source_internal_id'] = product_sku
            product_id['ID_kind'] = 'feelunique_internal_id'
            product_id['ID_value'] = product_sku
            yield product_id

            bv_params = self.bv_base_params.copy()
            bv_params['bv_id'] = product_sku
            bv_params['offset'] = 0
            review_url = self.get_review_url(**bv_params)

            last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
                self.mysql_manager, self.spider_conf['source_id'], product_sku)

            request = Request(review_url, callback=self.parse_reviews)
            request.meta['last_user_review'] = last_user_review
            request.meta['filter_other_sources'] = False
            request.meta['OriginalCategoryName'] = category['category_path']
            request.meta['bv_id'] = product_sku
            yield request
        else:
            # The XPath is relative, so it is evaluated against this
            # product node rather than the whole response.
            product_url = self.extract(product.xpath(product_url_xpath))
            product_url = get_full_url(response, product_url)
            self.logger.info("Failed to get SKU for product at %s", product_url)

    next_page_url = self.extract_xpath(response, next_page_xpath)
    if next_page_url:
        next_page_url = get_full_url(response, next_page_url)
        next_page_request = Request(next_page_url, callback=self.parse_category)
        next_page_request.meta['category'] = category
        yield next_page_request
def parse_review(self, response):
    """Parse a professional review page into category/product/review items.

    Yields, in order:
      * a ``CategoryItem`` when the page exposes a category name;
      * a ``ProductIdItem`` when a numeric internal id can be derived from
        the ``og:url`` meta tag or, failing that, from ``response.url``;
      * the product item;
      * either a ``Request`` for the last pagination page (the review is
        passed in ``meta['review']`` and completed by
        ``self.get_test_verdict``) or, when the review has a single page,
        the review item itself with ``TestVerdict`` filled in.
    """
    product_xpaths = {
        'PicURL': '//meta[@property="og:image"]/@content',
        'OriginalCategoryName': '//div[@class="dennis-kicker"]/a/text()'
    }
    review_xpaths = {
        'TestSummary': '//meta[@property="og:description"]/@content',
        'TestPros': '//div[contains(@class, "field-name-field-pros")]'
                    '/div[@class="field-items"]//text()',
        'TestCons': '//div[contains(@class, "field-name-field-cons")]'
                    '/div[@class="field-items"]//text()',
        'TestDateText': '//span[@class="date-display-single"]/text()',
        'Author': '//span[@class="field field-name-field-author '
                  'field-type-node-reference field-label-hidden"]/'
                  'span[@class="field-item even"]/text() | //div[@class="field '
                  'field-name-author-names-combined field-type-text '
                  'field-label-hidden"]/div[@class="field-items"]/div'
                  '[@class="field-item even"]/a'
    }

    product_name = ''
    product = self.init_item_by_xpaths(response, 'product', product_xpaths)
    review = self.init_item_by_xpaths(response, 'review', review_xpaths)

    title_xpath = '//meta[@property="og:title"]/@content'
    title = self.extract(response.xpath(title_xpath))

    review_json_ld = extruct_helper.extract_json_ld(
        response.text, 'Review')
    if review_json_ld:
        review = extruct_helper.review_item_from_review_json_ld(
            review_json_ld, review)
        # Tolerate a null "itemReviewed"/"name" in the JSON-LD.
        item_reviewed = review_json_ld.get('itemReviewed') or {}
        product_name = item_reviewed.get('name') or ''
        product_name = product_name.split('review')[0].strip()

    # Normalise the review date when one was extracted.
    if review.get('TestDateText', ''):
        review['TestDateText'] = date_format(review['TestDateText'], '')

    # Fall back to the page title when JSON-LD gave no product name.
    if not product_name:
        product_name = title.split('review')[0].strip()

    product['ProductName'] = product_name
    review['ProductName'] = product['ProductName']
    review['TestTitle'] = title

    category_url_xpath = '//div[contains(@class, '\
                         '"field-category-primary")]//a/@href'
    if product.get('OriginalCategoryName', ''):
        category = CategoryItem()
        category['category_leaf'] = product['OriginalCategoryName']
        category['category_path'] = product['OriginalCategoryName']
        category['category_url'] = get_full_url(
            response, self.extract(response.xpath(category_url_xpath)))
        yield category

    # NOTE: award extraction (award name and AwardPic taken from the
    # "field-name-field-award-image" image) is currently disabled.

    internal_id = ''
    internal_id_url_xpath = '//meta[@property="og:url"]/@content'
    internal_id_re = r'go/([0-9]+)'
    internal_id_url = self.extract_xpath(response, internal_id_url_xpath)
    if internal_id_url:
        internal_id_match = re.search(internal_id_re, internal_id_url)
        if internal_id_match:
            internal_id = internal_id_match.group(1)
        else:
            # Second-to-last path segment; guard against a value without
            # any slash, which would otherwise raise IndexError.
            url_parts = internal_id_url.split('/')
            internal_id = url_parts[-2] if len(url_parts) > 1 else ''

    if not internal_id or not internal_id.isdigit():
        internal_id = response.url.split('/')[-2]

    if internal_id and internal_id.isdigit():
        product_id = ProductIdItem()
        product_id['ProductName'] = product['ProductName']
        product_id['source_internal_id'] = internal_id
        product_id['ID_kind'] = 'expertreviews_internal_id'
        product_id['ID_value'] = internal_id
        yield product_id
        product['source_internal_id'] = internal_id

    yield product

    review['DBaseCategoryName'] = 'PRO'
    review['SourceTestScale'] = '5'
    # The product only carries an id when a numeric one was found; reading
    # the field unconditionally would raise KeyError and drop the review.
    source_internal_id = product.get('source_internal_id', '')
    if source_internal_id:
        review['source_internal_id'] = source_internal_id

    verdict_page_xpath = '//section[@class="pagination mn_background"]'\
                         '//li[last()]/a/@href'
    verdict_page_url = self.extract(response.xpath(verdict_page_xpath))
    if verdict_page_url:
        # Multi-page review: the verdict is on the last page.
        verdict_page_url = get_full_url(response, verdict_page_url)
        request = Request(verdict_page_url,
                          callback=self.get_test_verdict)
        request.meta['review'] = review
        yield request
    else:
        test_verdict_xpath = '(//div[contains(@class, "field-name-body")]'\
                             '//p[ not(strong) and .//text()[normalize-space()]and'\
                             ' .//text()[not(starts-with(., "Buy"))]and '\
                             './/text()[not(starts-with(., "BUY"))] ])[last()]//text()'
        review["TestVerdict"] = self.extract_all(
            response.xpath(test_verdict_xpath),
            separator='',
            keep_whitespace=True)
        yield review
def parse_review(self, response):
    """Build the product and review items for a single review page.

    The category item is read from ``response.meta['category']``. Fields
    are first extracted with XPath, then the review is passed through the
    ``Review`` JSON-LD helper when such JSON-LD is present on the page.
    When none of summary, verdict, pros and cons could be extracted, the
    first and last paragraphs of the ``Intro`` section are used as summary
    and verdict.

    Yields the product item followed by the review item.
    """
    page_category = response.meta['category']

    product_item = self.init_item_by_xpaths(response, "product", {
        "source_internal_id":
            u"//div[@class='article_content']/descendant-or-self::*[./@data-product-id][1]/@data-product-id",
        "ProductName": u"normalize-space(//h1)",
        "PicURL": u"//meta[@property='og:image']/@content",
    })
    product_item['ProductName'] = remove_suffix(
        product_item['ProductName'], ' Review')
    product_item['TestUrl'] = response.url
    product_item['OriginalCategoryName'] = page_category['category_path']

    picture = product_item.get('PicURL', '')
    if picture:
        product_item['PicURL'] = get_full_url(response, picture)

    review_item = self.init_item_by_xpaths(response, "review", {
        "TestDateText": u"//*[@itemprop='datePublished']/@content",
        "TestPros": u"//p[.//*[contains(text(),'Pros')]]/text()",
        "TestCons": u"//p[.//*[contains(text(),'Cons')]]/text()",
        "TestSummary": u"string(//h2[text()='Summary']/following::p)",
        "TestVerdict": u"//p[.//*[contains(text(),'Verdict')]]/text()",
        "TestTitle": u"normalize-space(//h1)",
        "award": u"(//a[contains(@alt, 'Award')])[1]/@alt",
        "AwardPic": u"(//a[contains(@alt, 'Award')])[1]/@data-bgset"
    })
    review_item['TestUrl'] = response.url

    json_ld = extruct_helper.extract_json_ld(response.text, 'Review')
    if json_ld:
        review_item = extruct_helper.review_item_from_review_json_ld(
            json_ld, review_item)

    # Keep both items in agreement on the product name, preferring the
    # one already present on the review.
    reviewed_name = review_item.get('ProductName')
    if reviewed_name:
        product_item['ProductName'] = reviewed_name
    else:
        review_item['ProductName'] = product_item['ProductName']

    award_picture = review_item.get("AwardPic", "")
    if award_picture:
        review_item["AwardPic"] = get_full_url(response, award_picture)

    # Not a detailed review, can only get summary and verdict
    detail_fields = ('TestSummary', 'TestVerdict', 'TestPros', 'TestCons')
    if not any(review_item[field] for field in detail_fields):
        summary_alt_xpath = "string(//section[@id='Intro']/p[1])"
        verdict_alt_xpath = "string(//section[@id='Intro']/p[last()])"
        review_item['TestSummary'] = self.extract(
            response.xpath(summary_alt_xpath))
        review_item['TestVerdict'] = self.extract(
            response.xpath(verdict_alt_xpath))

    review_item["DBaseCategoryName"] = "PRO"
    review_item["SourceTestScale"] = "10"

    for scraped in (product_item, review_item):
        yield scraped