Example #1
0
    def parse_review(self, response):
        """Build the professional ("PRO") review item for a review page.

        The item is seeded from the page's ``Review`` JSON-LD when present,
        otherwise from its ``NewsArticle`` JSON-LD, otherwise it starts
        empty.  Product name, internal id, pros and cons are then read from
        the page and the URL.

        :param response: response of the review page; its ``text``, ``url``
            and ``xpath`` are used.
        :return: the populated review item.
        """
        review_json_ld = extruct_helper.extract_json_ld(
            response.text, "Review")
        article_json_ld = extruct_helper.extract_json_ld(
            response.text, "NewsArticle")

        if review_json_ld:
            review = extruct_helper.review_item_from_review_json_ld(
                review_json_ld)
        elif article_json_ld:
            review = extruct_helper.review_item_from_article_json_ld(
                article_json_ld)
        else:
            review = ReviewItem()

        review['DBaseCategoryName'] = 'PRO'
        if not review.get('TestUrl', ''):
            review['TestUrl'] = response.url

        # The on-page product block overrides any name taken from JSON-LD;
        # the spider's own helper is the fallback when the block is empty.
        review['ProductName'] = self.extract(
            response.xpath(
                "//div[@class='productDataBlock']/ul/li[1]/strong/text()"))
        if not review.get('ProductName', ''):
            review['ProductName'] = self.get_product_name(response)

        # The internal id is the fifth "/"-separated piece of the URL.  Read
        # it from response.url directly rather than from the response's
        # string form, and skip it when the URL is too short instead of
        # raising IndexError.
        url_parts = response.url.split("/")
        if len(url_parts) > 4:
            review['source_internal_id'] = url_parts[4]

        review['TestPros'] = self.extract(
            response.xpath("//div[@id='ahReviewPros']/ul/li/text()"))
        review['TestCons'] = self.extract(
            response.xpath("//div[@id='ahReviewCons']/ul/li/text()"))

        return review
Example #2
0
    def parse_reviews(self, response):
        """Complete and yield the product and review carried in the meta.

        The review gets its verdict, pros and cons from the page, is merged
        with the page's ``Review`` JSON-LD when there is one, and is yielded
        after the product.

        :param response: review page response whose ``meta`` holds the
            ``product`` and ``review`` items started by an earlier callback.
        """
        product = response.meta['product']
        review = response.meta['review']

        product['TestUrl'] = response.url

        # The verdict paragraphs follow a "Wrap"/"Conclusion" heading; try
        # the h4 variant first and fall back to h3 when it yields nothing.
        verdict_xpaths = (
            '//h4[contains(text(),"Wrap") or contains(text(),"Conclusion")]/following-sibling::p//text()',
            '//h3[contains(text(),"Wrap") or contains(text(),"Conclusion")]/following-sibling::p//text()',
        )
        for verdict_xpath in verdict_xpaths:
            review['TestVerdict'] = self.extract_all(
                response.xpath(verdict_xpath), separator=" ")
            if review['TestVerdict']:
                break

        review['DBaseCategoryName'] = "PRO"
        review['TestUrl'] = response.url

        pros_nodes = response.xpath(
            "//div[contains(@class, 'review-pros')]//li/text()")
        review['TestPros'] = self.extract_all(pros_nodes, separator=' ; ')
        cons_nodes = response.xpath(
            "//div[contains(@class, 'review-cons')]//li/text()")
        review['TestCons'] = self.extract_all(cons_nodes, separator=' ; ')

        structured_review = extruct_helper.extract_json_ld(
            response.text, 'Review')
        if structured_review:
            review = extruct_helper.review_item_from_review_json_ld(
                structured_review, review)

        yield product
        yield review
Example #3
0
    def parse_review(self, response):
        """Build a PRO review item from XPath data and ``Review`` JSON-LD.

        Summary, pros, cons, internal id and product name come from XPath;
        the item is then passed through the JSON-LD helper when the page
        carries a ``Review`` block.  Missing nodes produce empty strings
        rather than an IndexError.

        :param response: response of the review page.
        :return: the populated review item.
        """
        review = ReviewItem()

        # Parsing using XPath
        xpaths = {
            'TestSummary': '//meta[@property="og:description"]/@content',
            'TestPros': '//*[@class="rs-review--positives"]//span/text()',
            'TestCons': '//*[@class="rs-review--negatives"]//span/text()',
            'source_internal_id':
            '//div[@data-widget="article-edit"]/@data-meta',
            'ProductName': '//section/header/h1/text()',
        }

        # Extract
        data = {}
        for key in xpaths:
            data[key] = response.xpath(xpaths[key]).extract()

        # Process
        # The id lives in a JSON blob stored in the data-meta attribute.
        # When the attribute is absent store an empty string, not the empty
        # result list.
        if data['source_internal_id']:
            data['source_internal_id'] = json.loads(
                data['source_internal_id'][0]).get('id')
        else:
            data['source_internal_id'] = ''
        data['TestPros'] = ';'.join(data['TestPros'])
        data['TestCons'] = ';'.join(data['TestCons'])
        # Take the first match, tolerating pages where the node is missing.
        for key in ('TestSummary', 'ProductName'):
            data[key] = data[key][0] if data[key] else ''

        for key in xpaths:
            review[key] = data[key]

        # Parsing using JSON-LD
        # Populates:
        # Author, SourceTestRating, SourceTestScale, TestDateText, TestTitle

        review_json_ld = extruct_helper.extract_json_ld(
            response.text, 'Review')

        if review_json_ld:
            review = extruct_helper.review_item_from_review_json_ld(
                review_json_ld, review)

        review['TestUrl'] = response.url
        review['source_id'] = self.spider_conf["source_id"]
        review['DBaseCategoryName'] = 'PRO'

        # There are some occurrences of "null" in the TestTitle and
        # TestSummary.  Use .get() because TestTitle is only set by the
        # JSON-LD step and may be absent.
        if review.get('TestTitle') == 'null':
            review['TestTitle'] = review['ProductName']
        if review.get('TestSummary') == 'null':
            review['TestSummary'] = ''

        return review
 def test_review_item_from_review_json_ld_default_best_rating(self):
     """A rating without ``bestRating`` yields a review scale of 5."""
     page_source = '''<script type="application/ld+json">
                    {
                        "@context":"http://schema.org/",
                        "@type":"Review",
                        "itemReviewed":{"@type":"Product","name":"OnePlus 5"},
                        "reviewRating":{"@type":"Rating","ratingValue":5}
                    }
                    </script>'''
     review_json_ld = extruct_helper.extract_json_ld(page_source, 'Review')
     item = extruct_helper.review_item_from_review_json_ld(review_json_ld)
     self.assertIsNotNone(item)
     scale = int(item['SourceTestScale'])
     self.assertEqual(scale, 5)
 def test_review_item_from_review_json_ld_full_review(self):
     """A fully populated ``Review`` JSON-LD block maps onto every field.

     Checks product name, author, date, rating, scale, title and summary.
     """
     html_text = '''<script type="application/ld+json">
                    {
                       "@context": "http://schema.org/",
                       "@type": "Review",
                       "itemReviewed": {
                         "@type": "Product",
                         "name": "OnePlus 5"
                       },
                       "author": {
                         "@type": "Person",
                         "name": "Joe"
                       },
                       "reviewRating": {
                         "@type": "Rating",
                         "ratingValue": "7",
                         "bestRating": "10"
                       },
                       "publisher": {
                         "@type": "Organization",
                         "name": "CNET"
                       },
                       "datePublished":"2017-08-07",
                       "headline":"OnePlus 5 review",
                       "description":"The OnePlus 5 is one of the best phones you can buy today"
                     }
                     </script>'''
     json_ld = extruct_helper.extract_json_ld(html_text, 'Review')
     review = extruct_helper.review_item_from_review_json_ld(json_ld)
     self.assertIsNotNone(review)
     self.assertEqual(review['ProductName'], 'OnePlus 5')
     self.assertEqual(review['Author'], 'Joe')
     # assertEqual rather than the deprecated assertEquals alias, matching
     # the other assertions in this test.
     self.assertEqual(review['TestDateText'], '2017-08-07')
     self.assertEqual(int(review['SourceTestRating']), 7)
     self.assertEqual(int(review['SourceTestScale']), 10)
     self.assertEqual(review['TestTitle'], 'OnePlus 5 review')
     self.assertEqual(
         review['TestSummary'],
         'The OnePlus 5 is one of the best phones you can buy today')
Example #6
0
    def parse_review(self, response):
        """Yield the category, product and PRO review for a review page.

        XPath extraction seeds both items; ``Review``, ``NewsArticle`` and
        ``Product`` JSON-LD fill in or override fields; XPath fallbacks
        cover product name, date, author, picture and rating.  Items are
        yielded in the order category (if any), product, review.  Nothing
        more is yielded after the category when
        ``self.should_skip_category`` returns true for it.

        :param response: response of the review page.
        """
        # TODO verdict not found and source_id not found

        product_xpath = {"PicURL": "//*[@property='og:image']/@content"}
        review_xpaths = {
            "TestSummary": "//*[@property='og:description']/@content",
            "TestPros": "//div[@id='wired-tired']//p[1]/text()",
            "TestCons": "//div[@id='wired-tired']//p[2]/text()",
            "TestDateText": "(//meta[@itemprop='datePublished'])[1]/@content",
        }

        product = self.init_item_by_xpaths(response, "product", product_xpath)
        review = self.init_item_by_xpaths(response, "review", review_xpaths)

        # utilize structured data
        # --------------------------------------------------
        # get review from structured data 'Review'
        review_json_ld = extruct_helper.extract_json_ld(
            response.text, 'Review')
        if review_json_ld:
            review = extruct_helper.review_item_from_review_json_ld(
                review_json_ld, review)

        # get title from structured data 'NewsArticle', or Product
        # -------------------------------------------------------
        # wired.com uses the format
        # 'Review: [product name] | wired' as title
        # most of the time
        title = ''
        news_article_json_ld = extruct_helper.extract_json_ld(
            response.text, 'NewsArticle')
        product_json_ld = extruct_helper.extract_json_ld(
            response.text, 'Product')

        # `or ''` guards against a missing or null headline/name, which
        # would otherwise raise AttributeError on .strip().
        if news_article_json_ld:
            title = (news_article_json_ld.get('headline') or '').strip()
            review['TestTitle'] = title
        elif product_json_ld:
            title = (product_json_ld.get('name') or '').strip()
            review['TestTitle'] = title

        # double check product name
        # --------------------------------------------------
        product_name = review.get('ProductName')
        if not product_name:
            if title.startswith('Review:'):
                # 'Review: <name> ...' -> the piece after the first colon
                product_name = title.split(':')[1].strip()
            else:
                product_name = title.split('Review')[0].strip()

            # get rid of the last part of 'product_name | wired'
            if '|' in product_name:
                product_name = product_name.split('|')[0].strip()

            review['ProductName'] = product_name

        product['ProductName'] = product_name

        # double check date
        # --------------------------------------------------
        # .get() so that an unset field falls through to the XPath fallback
        # instead of raising KeyError.
        date = review.get('TestDateText')
        if not date:
            date_xpath = "//meta[@name='parsely-pub-date']/@content"
            date = self.extract(response.xpath(date_xpath))
        review['TestDateText'] = date_format(date, '')

        # double check author
        # --------------------------------------------------
        author = review.get('Author', '')
        if not author:
            author_xpath = "//span[@itemprop='author']/a/text()"
            author = self.extract(response.xpath(author_xpath))
            if author:
                review['Author'] = author

        # parse category using tags
        category = self.get_categories_from_tags(response)
        if category:
            yield category
            if self.should_skip_category(category):
                return
            product['OriginalCategoryName'] = category['category_path']

        # double check PicURL for product
        # --------------------------------------------------
        pic_url = product.get('PicURL')
        if not pic_url:
            pic_url_xpath = "(//div[contains(@class, 'gallery-pic')]//img)[1]/@src"
            pic_url = self.extract_xpath(response, pic_url_xpath)
            if pic_url:
                product['PicURL'] = pic_url

        yield product

        # double check review rating
        # --------------------------------------------------
        rating_value = review.get('SourceTestRating')
        if not rating_value:
            rating_text_xpath = "//h3[contains(text(), 'RATING')]/following-sibling::p//text()"
            rating_text = self.extract_xpath(response, rating_text_xpath)
            rating_re = r'([0-9]+)'
            if rating_text:
                rating_match = re.search(rating_re, rating_text)
                if rating_match:
                    review['SourceTestRating'] = rating_match.group(0)
                    # The scale is set to 10 whenever the rating comes from
                    # the page text.  A u'' literal gives the same unicode
                    # value as unicode('10') without the Python 2-only
                    # builtin.
                    review['SourceTestScale'] = u'10'

        review["DBaseCategoryName"] = "PRO"
        yield review
    def parse_review(self, response):
        """Yield the category, product and PRO review for a review page.

        XPath extraction seeds both items and ``Review`` JSON-LD refines the
        review.  The product name comes from the JSON-LD ``itemReviewed``
        name or, failing that, the page's ``h1``, cut at the word
        "review".  The internal id is the last path segment of the URL.
        Items are yielded in the order category (if a category path was
        found), product, review.

        :param response: response of the review page.
        """
        product_xpaths = {"PicURL": "//*[@property='og:image']/@content"}

        review_xpaths = {
            "TestTitle": "//*[@property='og:title']/@content",
            "TestSummary": '//meta[@property="og:description"]/@content',
            "TestVerdict": '//a[@id="conclusion"]/following::p[1]/text()',
            "TestPros": '//div[@class="iconProText"]/text()',
            "TestCons": '//div[@class="iconConText"]/text()',
            'Author': '//div[@class="small"]/a/text()',
        }

        product_name = ''
        product = self.init_item_by_xpaths(response, "product", product_xpaths)
        review = self.init_item_by_xpaths(response, "review", review_xpaths)

        # utilize structured data
        review_json_ld = extruct_helper.extract_json_ld(
            response.text, 'Review')
        if review_json_ld:
            review = extruct_helper.review_item_from_review_json_ld(
                review_json_ld, review)
            product_name = review_json_ld.get('itemReviewed',
                                              {}).get('name', '')
            product_name = product_name.split('review')[0].strip()

        # incremental
        if review.get('TestDateText', ''):
            review['TestDateText'] = date_format(review['TestDateText'], '')
        else:
            test_date_xpath = '//div[@class="small"][2]/text()'
            test_date = self.extract(response.xpath(test_date_xpath))
            # Only parse when something was extracted.
            # NOTE(review): `parse` is presumably dateutil's parser, which
            # raises on an empty string - confirm against the file imports.
            if test_date:
                review['TestDateText'] = parse(test_date).strftime("%Y-%m-%d")

        if not product_name:
            title_xpath = "//h1/text()"
            title = self.extract(response.xpath(title_xpath))
            if title:
                product_name = title.split('review')[0].strip()

        product['ProductName'] = product_name
        review['ProductName'] = product['ProductName']

        category_path_xpath = "//span[@itemprop='name']/text()"
        all_category_names = self.extract_all(
            response.xpath(category_path_xpath), separator=' | ')
        product['OriginalCategoryName'] = all_category_names

        # The last path segment of the URL serves as the internal id.
        source_int_id = response.url.split('/')[-1]
        product['source_internal_id'] = source_int_id
        review['source_internal_id'] = source_int_id

        if product.get('OriginalCategoryName', ''):
            category = CategoryItem()
            category['category_path'] = product['OriginalCategoryName']
            yield category

        yield product

        award_xpath = "//td/div[2]/img[contains(@alt, 'Award')]"
        award = response.xpath(award_xpath)
        if award:
            award_name = self.extract_xpath(award, './@alt')
            award_image_url = self.extract_xpath(award, './@src')
            # Record the award only when both its name and image are known.
            if award_name and award_image_url:
                review['award'] = 'TechGearLab ' + award_name
                review['AwardPic'] = award_image_url

        review["DBaseCategoryName"] = "PRO"

        yield review
Example #8
0
    def parse_review(self, response):
        """Yield the product and the PRO review for a review page.

        Both items are seeded by XPath.  The review is merged with the
        page's ``Review`` JSON-LD when present, and the product name is kept
        in sync between the two items.  When the page has no summary,
        verdict, pros or cons, the first and last intro paragraphs are used
        as summary and verdict.

        :param response: review page response; ``meta['category']`` provides
            the product's category path.
        """
        category = response.meta['category']

        product = self.init_item_by_xpaths(response, "product", {
            "source_internal_id":
            u"//div[@class='article_content']/descendant-or-self::*[./@data-product-id][1]/@data-product-id",
            "ProductName": u"normalize-space(//h1)",
            "PicURL": u"//meta[@property='og:image']/@content",
        })
        product['ProductName'] = remove_suffix(product['ProductName'],
                                               ' Review')
        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = category['category_path']

        # Resolve the picture link against the page URL when there is one.
        picture_link = product.get('PicURL', '')
        if picture_link:
            product['PicURL'] = get_full_url(response, picture_link)

        review = self.init_item_by_xpaths(response, "review", {
            "TestDateText": u"//*[@itemprop='datePublished']/@content",
            "TestPros": u"//p[.//*[contains(text(),'Pros')]]/text()",
            "TestCons": u"//p[.//*[contains(text(),'Cons')]]/text()",
            "TestSummary": u"string(//h2[text()='Summary']/following::p)",
            "TestVerdict": u"//p[.//*[contains(text(),'Verdict')]]/text()",
            "TestTitle": u"normalize-space(//h1)",
            "award": u"(//a[contains(@alt, 'Award')])[1]/@alt",
            "AwardPic": u"(//a[contains(@alt, 'Award')])[1]/@data-bgset"
        })
        review['TestUrl'] = response.url

        structured_review = extruct_helper.extract_json_ld(
            response.text, 'Review')
        if structured_review:
            review = extruct_helper.review_item_from_review_json_ld(
                structured_review, review)

        # A product name on the review wins; otherwise the review inherits
        # the product's name.
        if not review.get('ProductName'):
            review['ProductName'] = product['ProductName']
        else:
            product['ProductName'] = review['ProductName']

        award_picture = review.get("AwardPic", "")
        if award_picture:
            review["AwardPic"] = get_full_url(response, award_picture)

        # Not a detailed review, can only get summary and verdict
        detail_fields = ('TestSummary', 'TestVerdict', 'TestPros', 'TestCons')
        if not any(review[field] for field in detail_fields):
            intro_first = response.xpath("string(//section[@id='Intro']/p[1])")
            review['TestSummary'] = self.extract(intro_first)
            intro_last = response.xpath(
                "string(//section[@id='Intro']/p[last()])")
            review['TestVerdict'] = self.extract(intro_last)

        review["DBaseCategoryName"] = "PRO"
        review["SourceTestScale"] = "10"

        yield product
        yield review
Example #9
0
    def parse_review(self, response):
        """Yield category, product id, product and PRO review for a page.

        The category comes from ``BreadcrumbList`` JSON-LD; the callback
        stops right after yielding it when ``self.should_skip_category``
        returns true.  The review is seeded by XPath and merged with
        ``Review`` JSON-LD, with two XPath fallbacks for the verdict.

        :param response: review page response; ``meta['product']`` holds the
            product item to complete.
        """
        product = response.meta['product']
        review = self.init_item_by_xpaths(response, "review", {
            "TestTitle": "//*[@property='og:title']/@content",
            "TestSummary": "//*[@property='og:description']/@content",
            "TestVerdict": "//section[@class='review-body']//*[contains(text(),'Conclusion')]/ancestor::p//text()",
            "TestPros": "//div[@class='pros-cons-bl']//*[contains(text(),'Pros')]//parent::li//p[@class='summary']//text()",
            "TestCons": "//div[@class='pros-cons-bl']//*[contains(text(),'Cons')]//parent::li//p[@class='summary']//text()",
        })

        # get category
        category = CategoryItem()
        breadcrumbs = extruct_helper.extract_json_ld(
            response.text, 'BreadcrumbList')
        if breadcrumbs:
            category = extruct_helper.leaf_category_item_from_breadcrumbs_json_ld(
                breadcrumbs, category)
            yield category
            if self.should_skip_category(category):
                return
            product["OriginalCategoryName"] = category['category_path']

        product['TestUrl'] = response.url
        review['TestUrl'] = product['TestUrl']

        structured_review = extruct_helper.extract_json_ld(
            response.text, 'Review')
        if structured_review:
            review = extruct_helper.review_item_from_review_json_ld(
                structured_review, _review=review)
            product['ProductName'] = review['ProductName']
        else:
            # No structured data: the page heading names the product.
            heading = response.xpath("//h1[contains(@class,'item')]/text()")
            product['ProductName'] = self.extract(heading)
            review['ProductName'] = product['ProductName']

        if review.get("TestDateText", ''):
            review["TestDateText"] = date_format(review["TestDateText"],
                                                 "%Y-%m-%dT%H:%M:%S")

        # Verdict fallbacks, tried in order while the verdict is still
        # empty: the first non-empty paragraph after the "Conclusion"
        # marker, then the last non-empty paragraph before the footer.
        verdict_fallbacks = (
            "string(//section[@class='review-body']//*[contains(text(),'Conclusion')]/following::p[ string-length(.//text()) > 0 ][1])",
            "string(//div[contains(@class, 'article-footer')]/preceding::p[ string-length(.//text()) > 0 ][1])",
        )
        for fallback_xpath in verdict_fallbacks:
            if review['TestVerdict']:
                break
            review['TestVerdict'] = self.extract_all(
                response.xpath(fallback_xpath))

        article_id = self.extract(
            response.xpath("//meta[@name='article-id']/@content"))
        if article_id:
            product['source_internal_id'] = article_id
            review['source_internal_id'] = article_id
            yield self.product_id(product, kind='pcmag_internal_id',
                                  value=article_id)

        award_logo_url = self.extract(
            response.xpath("//div[@class='editors-logo']/img/@src"))
        if award_logo_url:
            review['AwardPic'] = get_full_url(response, award_logo_url)
            review['award'] = "Editor's Choice"

        review["DBaseCategoryName"] = "PRO"
        review["SourceTestScale"] = "5"

        yield product
        yield review
    def parse_review(self, response):
        """Extract category, product and review data from a review page.

        The category is read from the page and yielded unless
        ``self.should_skip_category`` rejects it.  A product and a single
        PRO review are then built from the page's microdata and logged.

        NOTE(review): this callback looks unfinished.  It returns right
        after logging the microdata result, so neither item is yielded and
        the XPath/JSON-LD extraction below that ``return`` is unreachable.
        That code is kept unchanged for reference; decide whether to restore
        or delete it.

        :param response: response of the review page.
        """
        category_path_xpath = "(//div[@class='popular_groups']//li/a)[1]/text()"
        category = CategoryItem()
        category['category_path'] = self.extract(
            response.xpath(category_path_xpath))

        if self.should_skip_category(category):
            return

        yield category

        microdata_items = extruct_helper.get_microdata_extruct_items(
            response.text)
        if not microdata_items:
            return

        # The pattern needs a capture group for match.group(1) to work.
        # NOTE(review): presumably the path segment after /review/ is the
        # intended id - confirm.
        source_internal_id_re = r'/review/([^/]+)/'
        source_internal_id = ''
        match = re.search(source_internal_id_re, response.url)
        if match:
            source_internal_id = match.group(1)

        product = ProductItem.from_response(
            response, category, source_internal_id=source_internal_id)
        review = list(
            extruct_helper.get_reviews_microdata_extruct(microdata_items,
                                                         product,
                                                         review_type='PRO'))

        if len(review) > 1:
            self.logger.error(
                'Found more than 1 reviews in {0} through microdata'.format(
                    response.url))
            return

        # No review in the microdata: stop instead of raising IndexError.
        if not review:
            self.logger.warning(
                'Found no review in {0} through microdata'.format(
                    response.url))
            return

        review = review[0]
        # Debug output goes through the spider logger instead of print
        # statements.
        self.logger.debug('%s', product)
        self.logger.debug('%s', review)

        return

        # ------------------------------------------------------------------
        # NOTE(review): everything below is unreachable because of the
        # return above.
        # ------------------------------------------------------------------
        product_xpaths = {
            "ProductName": "//h1[@itemprop='headline']/text()",
            "PicURL": "//meta[@property='og:image']/@content",
        }
        product = self.init_item_by_xpaths(response, "product", product_xpaths)
        product['TestUrl'] = response.url
        picurl = product.get("PicURL", "")
        if picurl and picurl[:2] == "//":
            product["PicURL"] = "https:" + product["PicURL"]
        if picurl and picurl[:1] == "/":
            product["PicURL"] = get_full_url(response.url, picurl)

        product['OriginalCategoryName'] = category['category_path']

        review_xpaths = {
            "TestTitle":
            "//*[@property='og:title']/@content",
            "TestPros":
            "//div[div[text()='The good'] or span[text()='The good']]//li/text()",
            "TestCons":
            "//div[div[text()='The bad'] or span[text()='The bad']]//li/text()",
            "TestSummary":
            "//h3[.//text() = 'Bottom Line' or .//text() = 'Bottom line' or .//text = 'Verdict']/"
            "following-sibling::*[ .//text()[normalize-space()] ][1]//text()",
            "TestVerdict":
            "//div[div[text()='Verdict'] or span[text()='Verdict']]//p/text()",
        }
        review = self.init_item_by_xpaths(response, "review", review_xpaths)
        review['TestUrl'] = response.url

        review_json_ld = extruct_helper.extract_json_ld(
            response.text, 'Review')
        if review_json_ld:
            review = extruct_helper.review_item_from_review_json_ld(
                review_json_ld, review)

        if not review.get('TestDateText'):
            review['TestDateText'] = self.extract(
                response.xpath("//meta[@itemprop='datePublished']/@content"))

        if review["TestDateText"]:
            review["TestDateText"] = review["TestDateText"].strip()
            review["TestDateText"] = date_format(review["TestDateText"],
                                                 "%Y-%m-%d")

        if review.get('ProductName', ''):
            product['ProductName'] = review['ProductName']
        else:
            title = review["TestTitle"].lower()
            if ":" in title:
                all_title_parts = title.split(":")
                for part in all_title_parts:
                    review["ProductName"] = part.replace(
                        "review", "") if 'review' in part else title.replace(
                            "review", "")
            else:
                review["ProductName"] = title.replace("review", "")
                review["ProductName"] = review["ProductName"].strip("-: ")
                product["ProductName"] = review["ProductName"]

        internal_id_re = r',review-(.*)\.html'
        match = re.search(internal_id_re, response.url)
        if match:
            internal_id = match.group(1)
            product['source_internal_id'] = internal_id
            review['source_internal_id'] = internal_id

            product_id = self.product_id(product,
                                         kind='tomsguide_en_internal_id',
                                         value=internal_id)
            yield product_id

        alt_summary_xpath = "//div[@class='sbbl-content-text']/p//text()"
        if not review['TestSummary']:
            review['TestSummary'] = self.extract(
                response.xpath(alt_summary_xpath))

        alt_verdict_xpath = "//div[div[text()='Verdict'] or span[text()='Verdict']]//div/text()"
        if not review['TestVerdict']:
            review['TestVerdict'] = self.extract(
                response.xpath(alt_verdict_xpath))

        # only get summary from article description if both verdict and summary are empty,
        # or else summary and verdict may end up to be the same
        if not review['TestSummary'] and not review['TestVerdict']:
            review['TestSummary'] = self.extract(
                response.xpath("//meta[@name='description']/@content"))

        review["DBaseCategoryName"] = "PRO"

        ec_award_xpath = "//section[contains(@class, 'page-content-leftcol')]//div[@class='editor-pick']"
        ec_award = response.xpath(ec_award_xpath)
        if ec_award:
            review['award'] = "Editor's Choice"
            review[
                'AwardPic'] = "http://qa901.office.alatest.se/omt-award-images/tomsguide_en_editor_pick.png"

        yield product
        yield review
    def parse_review(self, response):
        """Yield category, product id, product and review (or a follow-up).

        Product and review are seeded by XPath and the review is merged with
        ``Review`` JSON-LD.  The numeric internal id is taken from the
        ``og:url`` meta tag or, failing that, from the response URL.  When
        the page has pagination, a request for its last page is yielded with
        the review in ``meta`` and ``self.get_test_verdict`` as callback;
        otherwise the verdict is read from this page and the review is
        yielded directly.

        :param response: response of the review page.
        """
        product_xpaths = {
            'PicURL': '//meta[@property="og:image"]/@content',
            'OriginalCategoryName': '//div[@class="dennis-kicker"]/a/text()'
        }

        review_xpaths = {
            'TestSummary':
            '//meta[@property="og:description"]/@content',
            'TestPros':
            '//div[contains(@class, "field-name-field-pros")]'
            '/div[@class="field-items"]//text()',
            'TestCons':
            '//div[contains(@class, "field-name-field-cons")]'
            '/div[@class="field-items"]//text()',
            'TestDateText':
            '//span[@class="date-display-single"]/text()',
            'Author':
            '//span[@class="field field-name-field-author '
            'field-type-node-reference field-label-hidden"]/'
            'span[@class="field-item even"]/text() | //div[@class="field '
            'field-name-author-names-combined field-type-text '
            'field-label-hidden"]/div[@class="field-items"]/div'
            '[@class="field-item even"]/a'
        }

        product_name = ''
        product = self.init_item_by_xpaths(response, 'product', product_xpaths)
        review = self.init_item_by_xpaths(response, 'review', review_xpaths)

        title = self.extract(
            response.xpath('//meta[@property="og:title"]/@content'))

        structured_review = extruct_helper.extract_json_ld(
            response.text, 'Review')
        if structured_review:
            review = extruct_helper.review_item_from_review_json_ld(
                structured_review, review)
            reviewed_item = structured_review.get('itemReviewed', {})
            product_name = reviewed_item.get('name', '')
            product_name = product_name.split('review')[0].strip()

        # get review date and do incremental scraping
        if review.get('TestDateText', ''):
            review['TestDateText'] = date_format(review['TestDateText'], '')

        # Fall back to the part of the og:title before the word "review".
        if not product_name:
            product_name = title.split('review')[0].strip()

        product['ProductName'] = product_name
        review['ProductName'] = product['ProductName']
        review['TestTitle'] = title

        if product.get('OriginalCategoryName', ''):
            category_link = self.extract(response.xpath(
                '//div[contains(@class, '
                '"field-category-primary")]//a/@href'))
            category = CategoryItem()
            category['category_leaf'] = product['OriginalCategoryName']
            category['category_path'] = product['OriginalCategoryName']
            category['category_url'] = get_full_url(response, category_link)
            yield category

        # Award extraction is currently disabled; the old code is kept for
        # reference:
        # award_xpath = '//div[contains(@class, "group-media")]//div[contains
        # (@class, "field-name-field-award-image")]//img/@src'
        # award = response.xpath(award_xpath)
        # if award:
        #     award_re = r'(.*)\s+Logo'
        #     award_name = award.xpath('./@title').re_first(award_re)
        #     award_image_url = self.extract_xpath(award, './@src')
        #     if award_name and award_image_url:
        #         review['award'] = award_name
        #         review['AwardPic'] = award_image_url

        # Internal id: the digits after "go/" in og:url, else the
        # second-to-last piece of og:url, else the second-to-last piece of
        # the response URL.
        internal_id = ''
        canonical_url = self.extract_xpath(
            response, '//meta[@property="og:url"]/@content')
        if canonical_url:
            id_match = re.search(r'go/([0-9]+)', canonical_url)
            if id_match:
                internal_id = id_match.group(1)
            else:
                internal_id = canonical_url.split('/')[-2]

        if not (internal_id and internal_id.isdigit()):
            internal_id = response.url.split('/')[-2]

        # Only a purely numeric id is published as a product id item.
        if internal_id and internal_id.isdigit():
            product_id = ProductIdItem()
            product_id['ProductName'] = product['ProductName']
            product_id['source_internal_id'] = internal_id
            product_id['ID_kind'] = 'expertreviews_internal_id'
            product_id['ID_value'] = internal_id
            yield product_id

        product['source_internal_id'] = internal_id
        yield product

        review['DBaseCategoryName'] = 'PRO'
        review['SourceTestScale'] = '5'
        review['source_internal_id'] = product['source_internal_id']

        last_page_link = self.extract(response.xpath(
            '//section[@class="pagination mn_background"]'
            '//li[last()]/a/@href'))
        if not last_page_link:
            # Single-page review: the verdict is the last paragraph matched
            # by the XPath below (no <strong> child, non-blank text, text
            # not starting with "Buy"/"BUY").
            verdict_xpath = (
                '(//div[contains(@class, "field-name-body")]'
                '//p[ not(strong) and .//text()[normalize-space()]and'
                ' .//text()[not(starts-with(., "Buy"))]and '
                './/text()[not(starts-with(., "BUY"))] ])[last()]//text()')
            review["TestVerdict"] = self.extract_all(
                response.xpath(verdict_xpath),
                separator='',
                keep_whitespace=True)
            yield review
        else:
            # Paginated review: hand the review over to the last page.
            last_page_url = get_full_url(response, last_page_link)
            request = Request(last_page_url, callback=self.get_test_verdict)
            request.meta['review'] = review
            yield request