def parse_review(self, response):
    """Build the review item for a single review page.

    The item is seeded from JSON-LD when the page carries it (a ``Review``
    object first, a ``NewsArticle`` object second) and from an empty
    ``ReviewItem`` otherwise; the remaining fields are read with XPath.

    :param response: response for the review page.
    :return: the populated review item.
    """
    review_json_ld = extruct_helper.extract_json_ld(
        response.text, "Review")
    article_json_ld = extruct_helper.extract_json_ld(
        response.text, "NewsArticle")

    if review_json_ld:
        review = extruct_helper.review_item_from_review_json_ld(
            review_json_ld)
    elif article_json_ld:
        review = extruct_helper.review_item_from_article_json_ld(
            article_json_ld)
    else:
        review = ReviewItem()

    review['DBaseCategoryName'] = 'PRO'
    if not review.get('TestUrl', ''):
        review['TestUrl'] = response.url

    # The on-page product block is tried first; get_product_name() is the
    # fallback when that block yields nothing.
    review['ProductName'] = self.extract(
        response.xpath(
            "//div[@class='productDataBlock']/ul/li[1]/strong/text()"))
    if not review.get('ProductName', ''):
        review['ProductName'] = self.get_product_name(response)

    # Take the id straight from the URL instead of from str(response),
    # which needed its trailing '>' stripped again. URLs with fewer than
    # five '/'-separated pieces have no id, so the field is left unset
    # rather than raising IndexError.
    url_parts = response.url.split("/")
    if len(url_parts) > 4:
        review['source_internal_id'] = url_parts[4]

    review['TestPros'] = self.extract(
        response.xpath("//div[@id='ahReviewPros']/ul/li/text()"))
    review['TestCons'] = self.extract(
        response.xpath("//div[@id='ahReviewCons']/ul/li/text()"))
    return review
def parse_reviews(self, response):
    """Complete the product and review passed in ``response.meta``.

    The verdict, pros and cons are read with XPath, then a JSON-LD
    ``Review`` object (when present) is merged into the review.

    :param response: response whose ``meta`` holds ``product`` and
        ``review``.
    :return: generator yielding the product, then the review.
    """
    product = response.meta['product']
    review = response.meta['review']
    page_url = response.url

    product['TestUrl'] = page_url

    # The verdict paragraphs follow an <h4> heading; the <h3> variant is
    # only tried when the first lookup comes back empty.
    verdict_xpaths = (
        '//h4[contains(text(),"Wrap") or contains(text(),"Conclusion")]/following-sibling::p//text()',
        '//h3[contains(text(),"Wrap") or contains(text(),"Conclusion")]/following-sibling::p//text()',
    )
    for verdict_xpath in verdict_xpaths:
        review['TestVerdict'] = self.extract_all(
            response.xpath(verdict_xpath), separator=" ")
        if review['TestVerdict']:
            break

    review['DBaseCategoryName'] = "PRO"
    review['TestUrl'] = page_url

    list_fields = (
        ('TestPros', "//div[contains(@class, 'review-pros')]//li/text()"),
        ('TestCons', "//div[contains(@class, 'review-cons')]//li/text()"),
    )
    for field_name, list_xpath in list_fields:
        review[field_name] = self.extract_all(
            response.xpath(list_xpath), separator=' ; ')

    structured = extruct_helper.extract_json_ld(response.text, 'Review')
    if structured:
        review = extruct_helper.review_item_from_review_json_ld(
            structured, review)

    yield product
    yield review
def parse_review(self, response):
    """Build a review item from XPath data merged with JSON-LD.

    XPath supplies the summary, pros, cons, internal id and product name;
    a JSON-LD ``Review`` object, when present, is merged on top. Missing
    page elements produce empty values instead of aborting the page.

    :param response: response for the review page.
    :return: the populated review item.
    """
    review = ReviewItem()

    # Parsing using XPath
    xpaths = {
        'TestSummary': '//meta[@property="og:description"]/@content',
        'TestPros': '//*[@class="rs-review--positives"]//span/text()',
        'TestCons': '//*[@class="rs-review--negatives"]//span/text()',
        'source_internal_id': '//div[@data-widget="article-edit"]/@data-meta',
        'ProductName': '//section/header/h1/text()',
    }

    # Extract: every value is a (possibly empty) list of matches
    data = {}
    for key in xpaths:
        data[key] = response.xpath(xpaths[key]).extract()

    # Process
    # The id lives inside a JSON blob in the data-meta attribute. Without
    # the attribute the field becomes '' rather than staying a list.
    raw_meta = data['source_internal_id']
    if raw_meta:
        data['source_internal_id'] = json.loads(raw_meta[0]).get('id')
    else:
        data['source_internal_id'] = ''

    data['TestPros'] = ';'.join(data['TestPros'])
    data['TestCons'] = ';'.join(data['TestCons'])

    # Take the first match only; an absent element yields '' instead of
    # raising IndexError.
    for key in ('TestSummary', 'ProductName'):
        data[key] = data[key][0] if data[key] else ''

    for key in xpaths:
        review[key] = data[key]

    # Parsing using JSON-LD
    # Populates:
    #   Author, SourceTestRating, SourceTestScale, TestDateText, TestTitle
    review_json_ld = extruct_helper.extract_json_ld(
        response.text, 'Review')
    if review_json_ld:
        review = extruct_helper.review_item_from_review_json_ld(
            review_json_ld, review)

    review['TestUrl'] = response.url
    review['source_id'] = self.spider_conf["source_id"]
    review['DBaseCategoryName'] = 'PRO'

    # There are some occurrences of "null" in the TestTitle and TestSummary.
    # .get() is used because TestTitle is only set by the JSON-LD step and
    # may be missing altogether.
    if review.get('TestTitle') == 'null':
        review['TestTitle'] = review['ProductName']
    if review.get('TestSummary') == 'null':
        review['TestSummary'] = ''

    return review
def test_review_item_from_review_json_ld_default_best_rating(self):
    # A Review whose rating has a ratingValue but no bestRating must still
    # produce a review item, with SourceTestScale equal to 5.
    html_text = (
        '<script type="application/ld+json"> '
        '{ "@context":"http://schema.org/", '
        '"@type":"Review", '
        '"itemReviewed":{"@type":"Product","name":"OnePlus 5"}, '
        '"reviewRating":{"@type":"Rating","ratingValue":5} '
        '} </script>'
    )

    review = extruct_helper.review_item_from_review_json_ld(
        extruct_helper.extract_json_ld(html_text, 'Review'))

    self.assertIsNotNone(review)
    scale = int(review['SourceTestScale'])
    self.assertEqual(scale, 5)
def test_review_item_from_review_json_ld_full_review(self):
    # A fully populated JSON-LD Review must map onto every review field:
    # product name, author, date, rating, scale, title and summary.
    html_text = (
        '<script type="application/ld+json"> '
        '{ "@context": "http://schema.org/", '
        '"@type": "Review", '
        '"itemReviewed": { "@type": "Product", "name": "OnePlus 5" }, '
        '"author": { "@type": "Person", "name": "Joe" }, '
        '"reviewRating": { "@type": "Rating", "ratingValue": "7", "bestRating": "10" }, '
        '"publisher": { "@type": "Organization", "name": "CNET" }, '
        '"datePublished":"2017-08-07", '
        '"headline":"OnePlus 5 review", '
        '"description":"The OnePlus 5 is one of the best phones you can buy today" '
        '} </script>'
    )
    json_ld = extruct_helper.extract_json_ld(html_text, 'Review')
    review = extruct_helper.review_item_from_review_json_ld(json_ld)

    self.assertIsNotNone(review)
    self.assertEqual(review['ProductName'], 'OnePlus 5')
    self.assertEqual(review['Author'], 'Joe')
    # assertEqual replaces the deprecated alias assertEquals.
    self.assertEqual(review['TestDateText'], '2017-08-07')
    self.assertEqual(int(review['SourceTestRating']), 7)
    self.assertEqual(int(review['SourceTestScale']), 10)
    self.assertEqual(review['TestTitle'], 'OnePlus 5 review')
    self.assertEqual(
        review['TestSummary'],
        'The OnePlus 5 is one of the best phones you can buy today')
def parse_review(self, response):
    """Yield the category, product and review items for one review page.

    Product and review are initialised from XPath, enriched with JSON-LD
    (``Review`` for the review itself, ``NewsArticle`` or ``Product`` for
    the title), and then each important field is "double checked" with a
    fallback lookup when it is still empty.

    Yield order: the category (when one is found), the product, the
    review. The generator stops after the category when
    ``should_skip_category`` says so.
    """
    # TODO verdict not found and source_id not found
    product_xpath = {"PicURL": "//*[@property='og:image']/@content"}
    review_xpaths = {
        "TestSummary": "//*[@property='og:description']/@content",
        "TestPros": "//div[@id='wired-tired']//p[1]/text()",
        "TestCons": "//div[@id='wired-tired']//p[2]/text()",
        "TestDateText": "(//meta[@itemprop='datePublished'])[1]/@content",
    }

    product = self.init_item_by_xpaths(response, "product", product_xpath)
    review = self.init_item_by_xpaths(response, "review", review_xpaths)

    # utilize structured data
    # --------------------------------------------------
    # get review from structured data 'Review'
    review_json_ld = extruct_helper.extract_json_ld(
        response.text, 'Review')
    if review_json_ld:
        review = extruct_helper.review_item_from_review_json_ld(
            review_json_ld, review)

    # get title from structured data 'NewsArticle', or Product
    # -------------------------------------------------------
    # wired.com uses the format
    # 'Review: [product name] | wired' as title
    # most of the time
    title = ''
    news_article_json_ld = extruct_helper.extract_json_ld(
        response.text, 'NewsArticle')
    product_json_ld = extruct_helper.extract_json_ld(
        response.text, 'Product')
    # NOTE(review): .get() returns None for a missing key, so .strip()
    # would raise AttributeError if 'headline' / 'name' is ever absent
    # from the JSON-LD object - confirm both keys are guaranteed.
    if news_article_json_ld:
        title = news_article_json_ld.get('headline').strip()
        review['TestTitle'] = title
    elif product_json_ld:
        title = product_json_ld.get('name').strip()
        review['TestTitle'] = title

    # double check product name
    # --------------------------------------------------
    product_name = review.get('ProductName')
    if not product_name:
        if title.startswith('Review:'):
            # 'Review: <name> ...' -> the piece after the first colon
            PRODUCT_INDEX = 1
            product_name = title.split(':')[PRODUCT_INDEX].strip()
        else:
            # '<name> Review ...' -> the piece before the word 'Review'
            product_name = title.split('Review')[0].strip()

    # get rid of the last part of 'product_name | wired'
    if '|' in product_name:
        product_name = product_name.split('|')[0].strip()

    review['ProductName'] = product_name
    product['ProductName'] = product_name

    # double check date
    # --------------------------------------------------
    # NOTE(review): direct indexing assumes init_item_by_xpaths always
    # sets 'TestDateText', even when the XPath matches nothing - confirm.
    date = review['TestDateText']
    if not date:
        date_xpath = "//meta[@name='parsely-pub-date']/@content"
        date = self.extract(response.xpath(date_xpath))
    review['TestDateText'] = date_format(date, '')

    # double check author
    # --------------------------------------------------
    author = review.get('Author', '')
    if not author:
        author_xpath = "//span[@itemprop='author']/a/text()"
        author = self.extract(response.xpath(author_xpath))
        if author:
            review['Author'] = author

    # parse category using tags
    category = self.get_categories_from_tags(response)
    if category:
        yield category
        # A skipped category ends the callback: no product or review.
        if self.should_skip_category(category):
            return
        product['OriginalCategoryName'] = category['category_path']

    # double check PicURL for product
    # --------------------------------------------------
    pic_url = product.get('PicURL')
    if not pic_url:
        pic_url_xpath = "(//div[contains(@class, 'gallery-pic')]//img)[1]/@src"
        pic_url = self.extract_xpath(response, pic_url_xpath)
        if pic_url:
            product['PicURL'] = pic_url

    yield product

    # double check review rating
    # --------------------------------------------------
    rating_value = review.get('SourceTestRating')
    if not rating_value:
        rating_text_xpath = "//h3[contains(text(), 'RATING')]/following-sibling::p//text()"
        rating_text = self.extract_xpath(response, rating_text_xpath)
        # first run of digits in the text that follows the RATING heading
        rating_re = r'([0-9]+)'
        if rating_text:
            rating_match = re.search(rating_re, rating_text)
            if rating_match:
                rating = rating_match.group(0)
                review['SourceTestRating'] = rating
                # unicode() is a Python 2 builtin; the scale is fixed at
                # '10' whenever the rating comes from this text fallback.
                REVIEW_SCALE = unicode('10')
                review['SourceTestScale'] = REVIEW_SCALE

    review["DBaseCategoryName"] = "PRO"
    yield review
def parse_review(self, response):
    """Yield the category, product and review for one review page.

    Items are initialised from XPath and enriched with a JSON-LD
    ``Review`` object when the page has one. The product name comes from
    the JSON-LD ``itemReviewed`` name, or from the ``<h1>`` text when
    that is empty; in both cases everything from the word 'review'
    onwards is cut off.

    Yield order: category (only when a category path was found), product,
    review.
    """
    product = self.init_item_by_xpaths(
        response, "product",
        {"PicURL": "//*[@property='og:image']/@content"})
    review = self.init_item_by_xpaths(response, "review", {
        "TestTitle": "//*[@property='og:title']/@content",
        "TestSummary": '//meta[@property="og:description"]/@content',
        "TestVerdict": '//a[@id="conclusion"]/following::p[1]/text()',
        "TestPros": '//div[@class="iconProText"]/text()',
        "TestCons": '//div[@class="iconConText"]/text()',
        'Author': '//div[@class="small"]/a/text()',
    })

    # utilize structured data
    name = ''
    structured = extruct_helper.extract_json_ld(response.text, 'Review')
    if structured:
        review = extruct_helper.review_item_from_review_json_ld(
            structured, review)
        reviewed = structured.get('itemReviewed', {})
        name = reviewed.get('name', '').split('review')[0].strip()

    # incremental
    known_date = review.get('TestDateText', '')
    if known_date:
        review['TestDateText'] = date_format(known_date, '')
    else:
        # No date so far: parse the text of the second "small" div.
        raw_date = self.extract(
            response.xpath('//div[@class="small"][2]/text()'))
        review['TestDateText'] = parse(raw_date).strftime("%Y-%m-%d")

    if not name:
        heading = self.extract(response.xpath("//h1/text()"))
        if heading:
            name = heading.split('review')[0].strip()

    product['ProductName'] = name
    review['ProductName'] = product['ProductName']

    product['OriginalCategoryName'] = self.extract_all(
        response.xpath("//span[@itemprop='name']/text()"),
        separator=' | ')

    # The last URL segment doubles as the internal id for both items.
    page_id = response.url.split('/')[-1]
    product['source_internal_id'] = page_id
    review['source_internal_id'] = page_id

    if product.get('OriginalCategoryName', ''):
        category = CategoryItem()
        category['category_path'] = product['OriginalCategoryName']
        yield category

    yield product

    award_nodes = response.xpath("//td/div[2]/img[contains(@alt, 'Award')]")
    if award_nodes:
        award_label = self.extract_xpath(award_nodes, './@alt')
        award_image = self.extract_xpath(award_nodes, './@src')
        # Both pieces are required before any award is recorded.
        if award_label and award_image:
            review['award'] = 'TechGearLab ' + award_label
            review['AwardPic'] = award_image

    review["DBaseCategoryName"] = "PRO"
    yield review
def parse_review(self, response):
    """Yield the product and the review for one review page.

    The category comes from ``response.meta['category']``. Both items are
    initialised from XPath; a JSON-LD ``Review`` object, when present, is
    merged into the review. When summary, verdict, pros and cons are all
    empty, the first and last paragraphs of the intro section are used as
    summary and verdict.
    """
    category = response.meta['category']
    page_url = response.url

    product = self.init_item_by_xpaths(response, "product", {
        "source_internal_id": u"//div[@class='article_content']/descendant-or-self::*[./@data-product-id][1]/@data-product-id",
        "ProductName": u"normalize-space(//h1)",
        "PicURL": u"//meta[@property='og:image']/@content",
    })
    product['ProductName'] = remove_suffix(product['ProductName'], ' Review')
    product['TestUrl'] = page_url
    product['OriginalCategoryName'] = category['category_path']

    picture = product.get('PicURL', '')
    if picture:
        product['PicURL'] = get_full_url(response, picture)

    review = self.init_item_by_xpaths(response, "review", {
        "TestDateText": u"//*[@itemprop='datePublished']/@content",
        "TestPros": u"//p[.//*[contains(text(),'Pros')]]/text()",
        "TestCons": u"//p[.//*[contains(text(),'Cons')]]/text()",
        "TestSummary": u"string(//h2[text()='Summary']/following::p)",
        "TestVerdict": u"//p[.//*[contains(text(),'Verdict')]]/text()",
        "TestTitle": u"normalize-space(//h1)",
        "award": u"(//a[contains(@alt, 'Award')])[1]/@alt",
        "AwardPic": u"(//a[contains(@alt, 'Award')])[1]/@data-bgset"
    })
    review['TestUrl'] = page_url

    structured = extruct_helper.extract_json_ld(response.text, 'Review')
    if structured:
        review = extruct_helper.review_item_from_review_json_ld(
            structured, review)

    # Keep both items on one product name; the review's name wins when
    # it has one.
    reviewed_name = review.get('ProductName')
    if reviewed_name:
        product['ProductName'] = reviewed_name
    else:
        review['ProductName'] = product['ProductName']

    award_picture = review.get("AwardPic", "")
    if award_picture:
        review["AwardPic"] = get_full_url(response, award_picture)

    # Not a detailed review, can only get summary and verdict
    detail_fields = ('TestSummary', 'TestVerdict', 'TestPros', 'TestCons')
    if not any(review[field] for field in detail_fields):
        review['TestSummary'] = self.extract(
            response.xpath("string(//section[@id='Intro']/p[1])"))
        review['TestVerdict'] = self.extract(
            response.xpath("string(//section[@id='Intro']/p[last()])"))

    review["DBaseCategoryName"] = "PRO"
    review["SourceTestScale"] = "10"

    yield product
    yield review
def parse_review(self, response):
    """Yield category, product id, product and review for a review page.

    The product arrives in ``response.meta['product']``; the review is
    initialised from XPath and merged with a JSON-LD ``Review`` object
    when the page has one. The category is taken from the JSON-LD
    ``BreadcrumbList``.

    Yield order: category (when breadcrumbs exist), product id item (when
    an article id is found), product, review. The generator stops after
    the category when ``should_skip_category`` says so.
    """
    review_xpaths = {
        "TestTitle": "//*[@property='og:title']/@content",
        "TestSummary": "//*[@property='og:description']/@content",
        "TestVerdict": "//section[@class='review-body']//*[contains(text(),'Conclusion')]/ancestor::p//text()",
        "TestPros":"//div[@class='pros-cons-bl']//*[contains(text(),'Pros')]//parent::li//p[@class='summary']//text()",
        "TestCons":"//div[@class='pros-cons-bl']//*[contains(text(),'Cons')]//parent::li//p[@class='summary']//text()",
    }
    product_name_xpath = "//h1[contains(@class,'item')]/text()"
    internal_id_xpath = "//meta[@name='article-id']/@content"
    award_xpath = "//div[@class='editors-logo']/img/@src"

    product = response.meta['product']
    review = self.init_item_by_xpaths(response, "review", review_xpaths)

    # get category
    # NOTE(review): the category is only yielded, checked and copied onto
    # the product when breadcrumbs exist; a page without them keeps
    # whatever OriginalCategoryName the incoming product had - confirm
    # this is the intended nesting.
    category = CategoryItem()
    breadcrumbs_json_ld = extruct_helper.extract_json_ld(response.text, 'BreadcrumbList')
    if breadcrumbs_json_ld:
        category = extruct_helper.leaf_category_item_from_breadcrumbs_json_ld(breadcrumbs_json_ld, category)
        yield category
        if self.should_skip_category(category):
            return
        category_name = category['category_path']
        product["OriginalCategoryName"] = category_name

    product['TestUrl'] = response.url
    review['TestUrl'] = product['TestUrl']

    review_json_ld = extruct_helper.extract_json_ld(response.text, 'Review')
    if review_json_ld:
        review = extruct_helper.review_item_from_review_json_ld(review_json_ld, _review=review)
        # NOTE(review): assumes the helper always sets 'ProductName'; the
        # direct lookup would raise KeyError otherwise - confirm.
        product['ProductName'] = review['ProductName']
    else:
        # No structured data: the page heading is the product name.
        product['ProductName'] = self.extract(response.xpath(product_name_xpath))
        review['ProductName'] = product['ProductName']

    if review.get("TestDateText", ''):
        review["TestDateText"] = date_format(review["TestDateText"], "%Y-%m-%dT%H:%M:%S")

    # Verdict fallbacks, tried in order while the verdict is still empty:
    # 1) first non-empty paragraph after the 'Conclusion' element,
    # 2) last non-empty paragraph before the article footer.
    alt_verdict_xpath = "string(//section[@class='review-body']//*[contains(text(),'Conclusion')]/following::p[ string-length(.//text()) > 0 ][1])"
    alt_verdict_xpath2 = "string(//div[contains(@class, 'article-footer')]/preceding::p[ string-length(.//text()) > 0 ][1])"
    if not review['TestVerdict']:
        review['TestVerdict'] = self.extract_all(response.xpath(alt_verdict_xpath))
    if not review['TestVerdict']:
        review['TestVerdict'] = self.extract_all(response.xpath(alt_verdict_xpath2))

    # The article id is stored on both items and also emitted as a
    # separate product id item.
    internal_id = self.extract(response.xpath(internal_id_xpath))
    if internal_id:
        product['source_internal_id'] = internal_id
        review['source_internal_id'] = internal_id
        product_id_item = self.product_id(product, kind='pcmag_internal_id', value=internal_id)
        yield product_id_item

    # The presence of the editors' logo image is what marks the award.
    ec_award_url = self.extract(response.xpath(award_xpath))
    if ec_award_url:
        review['AwardPic'] = get_full_url(response, ec_award_url)
        review['award'] = "Editor's Choice"

    review["DBaseCategoryName"] = "PRO"
    review["SourceTestScale"] = "5"

    yield product
    yield review
def parse_review(self, response):
    """Yield the page's category, then probe its microdata for a review.

    NOTE(review): as written, the function always returns right after the
    two ``print`` statements, so only the category item is ever yielded
    and everything below that ``return`` is unreachable. This looks like
    leftover debugging from a move to microdata extraction - confirm
    which of the two paths (microdata or XPath/JSON-LD) is meant to stay.
    """
    category_path_xpath = "(//div[@class='popular_groups']//li/a)[1]/text()"
    category = CategoryItem()
    category['category_path'] = self.extract(
        response.xpath(category_path_xpath))

    # Skipped categories produce no items at all, not even the category.
    if self.should_skip_category(category):
        return

    yield category

    microdata_items = extruct_helper.get_microdata_extruct_items(
        response.text)
    if not microdata_items:
        return

    source_internal_id_re = r'/review/[^/]+/'
    source_internal_id = ''
    match = re.search(source_internal_id_re, response.url)
    if match:
        # NOTE(review): the pattern above has no capture group, so
        # group(1) would raise whenever the URL actually matches -
        # probably meant r'/review/([^/]+)/'; confirm.
        source_internal_id = match.group(1)

    product = ProductItem.from_response(
        response, category, source_internal_id=source_internal_id)
    review = list(
        extruct_helper.get_reviews_microdata_extruct(microdata_items, product, review_type='PRO'))

    if len(review) > 1:
        self.logger.error(
            'Found more than 1 reviews in {0} through microdata'.format(
                response.url))
        return

    # NOTE(review): an empty list is not handled; review[0] would raise
    # IndexError when the microdata holds no review.
    review = review[0]
    # Python 2 print statements (debug output), followed by an
    # unconditional return.
    print product
    print review
    return

    # ------------------------------------------------------------------
    # Unreachable from here on (see the note in the docstring).
    # ------------------------------------------------------------------
    product_xpaths = {
        "ProductName": "//h1[@itemprop='headline']/text()",
        "PicURL": "//meta[@property='og:image']/@content",
    }
    product = self.init_item_by_xpaths(response, "product", product_xpaths)
    product['TestUrl'] = response.url

    # NOTE(review): a protocol-relative URL ("//host/...") satisfies both
    # conditions below, so the second assignment overrides the first.
    picurl = product.get("PicURL", "")
    if picurl and picurl[:2] == "//":
        product["PicURL"] = "https:" + product["PicURL"]
    if picurl and picurl[:1] == "/":
        product["PicURL"] = get_full_url(response.url, picurl)

    product['OriginalCategoryName'] = category['category_path']

    # NOTE(review): in "TestSummary" the third comparison reads
    # ".//text = 'Verdict'" without the "()" the first two have.
    review_xpaths = {
        "TestTitle": "//*[@property='og:title']/@content",
        "TestPros": "//div[div[text()='The good'] or span[text()='The good']]//li/text()",
        "TestCons": "//div[div[text()='The bad'] or span[text()='The bad']]//li/text()",
        "TestSummary": "//h3[.//text() = 'Bottom Line' or .//text() = 'Bottom line' or .//text = 'Verdict']/"
                       "following-sibling::*[ .//text()[normalize-space()] ][1]//text()",
        "TestVerdict": "//div[div[text()='Verdict'] or span[text()='Verdict']]//p/text()",
    }
    review = self.init_item_by_xpaths(response, "review", review_xpaths)
    review['TestUrl'] = response.url

    review_json_ld = extruct_helper.extract_json_ld(
        response.text, 'Review')
    if review_json_ld:
        review = extruct_helper.review_item_from_review_json_ld(
            review_json_ld, review)

    if not review.get('TestDateText'):
        review['TestDateText'] = self.extract(
            response.xpath("//meta[@itemprop='datePublished']/@content"))
    if review["TestDateText"]:
        review["TestDateText"] = review["TestDateText"].strip()
        review["TestDateText"] = date_format(review["TestDateText"],
                                             "%Y-%m-%d")

    if review.get('ProductName', ''):
        product['ProductName'] = review['ProductName']
    else:
        # Derive the product name from the lower-cased title with the
        # word "review" removed.
        title = review["TestTitle"].lower()
        if ":" in title:
            all_title_parts = title.split(":")
            # Each iteration overwrites the previous result, so only the
            # last part of the title decides the final value.
            for part in all_title_parts:
                review["ProductName"] = part.replace(
                    "review", "") if 'review' in part else title.replace(
                        "review", "")
        else:
            review["ProductName"] = title.replace("review", "")
        review["ProductName"] = review["ProductName"].strip("-: ")
        product["ProductName"] = review["ProductName"]

    # The id is whatever sits between ",review-" and ".html" in the URL.
    internal_id_re = r',review-(.*)\.html'
    match = re.search(internal_id_re, response.url)
    if match:
        internal_id = match.group(1)
        product['source_internal_id'] = internal_id
        review['source_internal_id'] = internal_id
        product_id = self.product_id(product,
                                     kind='tomsguide_en_internal_id',
                                     value=internal_id)
        yield product_id

    alt_summary_xpath = "//div[@class='sbbl-content-text']/p//text()"
    if not review['TestSummary']:
        review['TestSummary'] = self.extract(
            response.xpath(alt_summary_xpath))

    alt_verdict_xpath = "//div[div[text()='Verdict'] or span[text()='Verdict']]//div/text()"
    if not review['TestVerdict']:
        review['TestVerdict'] = self.extract(
            response.xpath(alt_verdict_xpath))

    # only get summary from article description if both verdict and summary are empty,
    # or else summary and verdict may end up to be the same
    if not review['TestSummary'] and not review['TestVerdict']:
        review['TestSummary'] = self.extract(
            response.xpath("//meta[@name='description']/@content"))

    review["DBaseCategoryName"] = "PRO"

    # The award picture is a fixed URL, not something read from the page.
    ec_award_xpath = "//section[contains(@class, 'page-content-leftcol')]//div[@class='editor-pick']"
    ec_award = response.xpath(ec_award_xpath)
    if ec_award:
        review['award'] = "Editor's Choice"
        review[
            'AwardPic'] = "http://qa901.office.alatest.se/omt-award-images/tomsguide_en_editor_pick.png"

    yield product
    yield review
def parse_review(self, response):
    """Yield category, product id, product and review for a review page.

    Items are initialised from XPath and the review is merged with a
    JSON-LD ``Review`` object when present. The numeric internal id is
    taken from the ``og:url`` meta tag, falling back to the response URL.

    Yield order: category (when the product has a category name), product
    id item (when a numeric id is found), product, and finally either a
    request for the last pagination page (carrying the review in
    ``meta`` for ``get_test_verdict``) or the review itself.
    """
    product_xpaths = {
        'PicURL': '//meta[@property="og:image"]/@content',
        'OriginalCategoryName': '//div[@class="dennis-kicker"]/a/text()'
    }
    review_xpaths = {
        'TestSummary': '//meta[@property="og:description"]/@content',
        'TestPros': '//div[contains(@class, "field-name-field-pros")]'
                    '/div[@class="field-items"]//text()',
        'TestCons': '//div[contains(@class, "field-name-field-cons")]'
                    '/div[@class="field-items"]//text()',
        'TestDateText': '//span[@class="date-display-single"]/text()',
        'Author': '//span[@class="field field-name-field-author '
                  'field-type-node-reference field-label-hidden"]/'
                  'span[@class="field-item even"]/text() | //div[@class="field '
                  'field-name-author-names-combined field-type-text '
                  'field-label-hidden"]/div[@class="field-items"]/div'
                  '[@class="field-item even"]/a'
    }

    product_name = ''
    product = self.init_item_by_xpaths(response, 'product', product_xpaths)
    review = self.init_item_by_xpaths(response, 'review', review_xpaths)

    title_xpath = '//meta[@property="og:title"]/@content'
    title = self.extract(response.xpath(title_xpath))

    review_json_ld = extruct_helper.extract_json_ld(
        response.text, 'Review')
    if review_json_ld:
        review = extruct_helper.review_item_from_review_json_ld(
            review_json_ld, review)
        # Keep only the part of the reviewed item's name that precedes
        # the word 'review'.
        product_name = review_json_ld.get('itemReviewed', {}).get('name',
                                                                  '')
        product_name = product_name.split('review')[0].strip()

    # get review date and do incremental scraping
    if review.get('TestDateText', ''):
        review['TestDateText'] = date_format(review['TestDateText'], '')

    if not product_name:
        # title_xpath = '//meta[@property="og:title"]/@content'
        # title = self.extract(response.xpath(title_xpath))
        product_name = title.split('review')[0].strip()

    product['ProductName'] = product_name
    review['ProductName'] = product['ProductName']
    review['TestTitle'] = title
    # product['TestTitle'] = title

    category_url_xpath = '//div[contains(@class, '\
        '"field-category-primary")]//a/@href'
    if product.get('OriginalCategoryName', ''):
        # A single-level category: leaf and path are the same name.
        category = CategoryItem()
        category['category_leaf'] = product['OriginalCategoryName']
        category['category_path'] = product['OriginalCategoryName']
        category['category_url'] = get_full_url(
            response, self.extract(response.xpath(category_url_xpath)))
        yield category

    # award_xpath = '//div[contains(@class, "group-media")]//div[contains
    # (@class, "field-name-field-award-image")]//img/@src'
    # award = response.xpath(award_xpath)
    # if award:
    #     award_re = r'(.*)\s+Logo'
    #     award_name = award.xpath('./@title').re_first(award_re)
    #     award_image_url = self.extract_xpath(award, './@src')
    #     if award_name and award_image_url:
    #         review['award'] = award_name
    #         review['AwardPic'] = award_image_url

    # Internal id lookup, in order of preference:
    # 1) digits after 'go/' in og:url,
    # 2) second-to-last segment of og:url,
    # 3) second-to-last segment of the response URL.
    internal_id = ''
    internal_id_url_xpath = '//meta[@property="og:url"]/@content'
    internal_id_re = r'go/([0-9]+)'
    internal_id_url = self.extract_xpath(response, internal_id_url_xpath)
    if internal_id_url:
        internal_id_match = re.search(internal_id_re, internal_id_url)
        if internal_id_match:
            internal_id = internal_id_match.group(1)
        else:
            internal_id = internal_id_url.split('/')[-2]

    if not internal_id or not internal_id.isdigit():
        internal_id = response.url.split('/')[-2]

    # Only a purely numeric id is published as a product id item.
    if internal_id and internal_id.isdigit():
        product_id = ProductIdItem()
        product_id['ProductName'] = product['ProductName']
        product_id['source_internal_id'] = internal_id
        product_id['ID_kind'] = 'expertreviews_internal_id'
        product_id['ID_value'] = internal_id
        yield product_id

    # NOTE(review): the id is stored on the product even when it is not
    # numeric; the review below reads it back with a direct lookup, which
    # relies on this assignment always running - confirm intended.
    product['source_internal_id'] = internal_id
    yield product

    review['DBaseCategoryName'] = 'PRO'
    review['SourceTestScale'] = '5'
    review['source_internal_id'] = product['source_internal_id']

    verdict_page_xpath = '//section[@class="pagination mn_background"]'\
        '//li[last()]/a/@href'
    verdict_page_url = self.extract(response.xpath(verdict_page_xpath))
    if verdict_page_url:
        # Paginated review: the review is handed to get_test_verdict via
        # the request meta instead of being yielded here.
        verdict_page_url = get_full_url(response, verdict_page_url)
        request = Request(verdict_page_url, callback=self.get_test_verdict)
        request.meta['review'] = review
        yield request
    else:
        # Single page: the verdict is the last body paragraph that has
        # text, contains no <strong> child, and has a text node that does
        # not start with "Buy" and one that does not start with "BUY".
        test_verdict_xpath = '(//div[contains(@class, "field-name-body")]'\
            '//p[ not(strong) and .//text()[normalize-space()]and'\
            ' .//text()[not(starts-with(., "Buy"))]and '\
            './/text()[not(starts-with(., "BUY"))] ])[last()]//text()'
        review["TestVerdict"] = self.extract_all(
            response.xpath(test_verdict_xpath),
            separator='', keep_whitespace=True)
        yield review