Python extract_json_ld Examples, alascrapy.lib.extruct_helper.extract_json_ld Python Examples

Example #1

0

Show file

    def parse_review(self, response):
        """Assemble the review item for a single review page.

        The item is seeded from the page's ``Review`` JSON-LD when present,
        otherwise from its ``NewsArticle`` JSON-LD, otherwise it starts out
        as an empty ``ReviewItem``.  Product name, internal id, pros and
        cons are then filled in from the page markup.

        :param response: the response handed to this callback.
        :return: the populated review item.
        """
        page_text = response.text
        review_ld = extruct_helper.extract_json_ld(page_text, "Review")
        article_ld = extruct_helper.extract_json_ld(page_text, "NewsArticle")

        if review_ld:
            review = extruct_helper.review_item_from_review_json_ld(review_ld)
        elif article_ld:
            review = extruct_helper.review_item_from_article_json_ld(
                article_ld)
        else:
            review = ReviewItem()

        review['DBaseCategoryName'] = 'PRO'
        if not review.get('TestUrl', ''):
            review['TestUrl'] = response.url

        # The markup value always replaces whatever the JSON-LD supplied;
        # get_product_name() is only the fallback for an empty result.
        name_xpath = "//div[@class='productDataBlock']/ul/li[1]/strong/text()"
        product_name = self.extract(response.xpath(name_xpath))
        if not product_name:
            product_name = self.get_product_name(response)
        review['ProductName'] = product_name

        # Fifth '/'-separated piece of str(response); the stripped '>'
        # presumably comes from the response's string form - TODO confirm.
        pieces = str(response).split("/")
        review['source_internal_id'] = pieces[4].rstrip('>')

        pros_nodes = response.xpath("//div[@id='ahReviewPros']/ul/li/text()")
        review['TestPros'] = self.extract(pros_nodes)
        cons_nodes = response.xpath("//div[@id='ahReviewCons']/ul/li/text()")
        review['TestCons'] = self.extract(cons_nodes)

        return review

Example #2

0

Show file

    def parse_product(self, response):
        """Yield category, product id, product and review request for a page.

        Emits, in order: the breadcrumb category (when JSON-LD provides one),
        a ``currys_internal_id`` product id (when a SKU is known), the
        product itself, and finally a request for the review page when the
        product URL matches ``self.reevoo_review_id_re``.

        :param response: the product page response.
        """
        # The category names extracted in category pages are not very
        # detailed, extract it in product page instead
        category = ''
        breadcrumbs_ld = extruct_helper.extract_json_ld(
            response.body, 'BreadcrumbList')
        if breadcrumbs_ld:
            category = extruct_helper.category_item_from_breadcrumbs_json_ld(
                breadcrumbs_ld)
            yield category
            if self.should_skip_category(category):
                return
        # TODO: retry if we fail to get JSON-LD?

        sku = ''
        product_ld = extruct_helper.extract_json_ld(response.body, 'Product')
        if product_ld:
            product = extruct_helper.product_item_from_product_json_ld(
                product_ld)
            sku = product_ld.get('sku', None)
        else:
            # Not sure why we fail to extract JSON-LD from some pages, it
            # will be good if we can figure out later
            fallback_xpaths = {
                "PicURL": "(//*[@property='og:image'])[1]/@content",
                "ProductName":
                    "//h1[contains(@class, 'page-title')]/span//text()",
                "ProductManufacturer":
                    "//h1[contains(@class,'page-title')]/span[1]/text()",
            }
            product = self.init_item_by_xpaths(
                response, "product", fallback_xpaths)

        if not sku:
            # Fall back to the product-code paragraph and keep only the
            # part after the last ': ' separator.
            sku = self.extract(response.xpath("//p[@class='prd-code']/text()"))
            if sku:
                sku = sku.rsplit(': ', 1)[-1]

        product['TestUrl'] = response.url
        if category:
            product['OriginalCategoryName'] = category['category_path']

        if sku:
            product['source_internal_id'] = sku
            yield self.product_id(
                product=product, kind='currys_internal_id', value=sku)

        yield product

        id_match = re.search(self.reevoo_review_id_re, response.url)
        reevoo_review_id = id_match.group(1) if id_match else ''
        if reevoo_review_id:
            # TODO: test if the url is valid or not?
            review_request = Request(
                url=self.review_url_format.format(reevoo_review_id),
                callback=self.parse_review)
            review_request.meta['product'] = product
            review_request.meta['rating_xpath'] = \
                ".//div[@class='overall_score_stars']/@title"
            yield review_request

Example #3

0

Show file

    def parse_review(self, response):
        """Build a review item from XPath data merged with Review JSON-LD.

        When no ``SourceTestRating`` was found, a secondary rating element
        is read and its value divided by 20 (``100 / 5``) before being
        stored.

        :param response: the review page response.
        """
        # NOTE(review): not used in the visible part of this callback -
        # presumably consumed further down; confirm before removing.
        category_json_ld = extruct_helper.extract_json_ld(
            response.body, 'BreadcrumbList')
        review_xpaths = {
            "SourceTestRating":
            "(//span[contains(@class,'rating')]/@title)[1]",
            "TestPros":
            "(//ul[contains(@class,'plusmin-list')])[1]"
            "//li[contains(@class,'plusmin-item')]//text()",
            "TestCons":
            "(//ul[contains(@class,'plusmin-list')])[2]"
            "//li[contains(@class,'plusmin-item')]//text()"
        }
        review = self.init_item_by_xpaths(response, "review", review_xpaths)
        review_json_ld = extruct_helper.extract_json_ld(
            response.body, 'Review')
        if review_json_ld:
            # NOTE(review): Review JSON-LD is fed to the *article* helper
            # here - looks deliberate, but confirm.
            review = extruct_helper.review_item_from_article_json_ld(
                review_json_ld, review)

        # different scale based rating system
        if not review.get('SourceTestRating', ''):
            rating_xpath = "//span[contains(@class, 'starrating')]/span[contains(@class, 'value-title')]/text()"
            rating_str = self.extract(response.xpath(rating_xpath))
            # new rating scale at 100
            rating_ratio = 100 / 5
            try:
                if rating_str:
                    rating_unified_str = (float(rating_str)) / rating_ratio
                    review["SourceTestRating"] = rating_unified_str
            # `except X as e` is valid on Python 2.6+ and Python 3, unlike
            # the comma form.
            except ValueError as e:
                print(e)
                # .format() must be applied to the string, not to the
                # result of print().
                print('rating_str is: {}'.format(rating_str))
        # NOTE(review): `review` is neither returned nor yielded in the
        # visible part of this callback - confirm it is emitted below.

Example #4

0

Show file

    def parse_product(self, response):
        """Yield the product found on the page and a request for its reviews.

        The internal id is the first group of ``self.product_url_re`` matched
        against the page URL; without a match an error is logged and nothing
        is emitted.  Without ``Product`` JSON-LD the request is handed to
        ``self._retry`` and its result yielded instead.

        :param response: product page response; ``response.meta['category']``
            must hold the category item.
        """
        category = response.meta['category']

        id_match = re.search(self.product_url_re, response.url)
        if not id_match:
            self.logger.error('Failed to get source internal id for product at: {}'.format(response.url))
            return
        source_internal_id = id_match.group(1)

        product_ld = extruct_helper.extract_json_ld(response.text, 'Product')
        if not product_ld:
            yield self._retry(response.request)
            return

        product = extruct_helper.product_item_from_product_json_ld(product_ld)
        product['TestUrl'] = response.url
        product['source_internal_id'] = source_internal_id
        product['OriginalCategoryName'] = category['category_path']
        yield product

        reviews_link = self.extract_xpath(
            response,
            "//div[@class='product-page--title-links']//a[@class='review-rating--reviews-link']/@href")
        if reviews_link:
            # The query string is appended verbatim to the absolute URL.
            sorted_reviews_url = (get_full_url(response, reviews_link)
                                  + '?sorteer=date%20desc')
            reviews_request = Request(sorted_reviews_url,
                                      callback=self.parse_reviews)
            reviews_request.meta['product'] = product
            yield reviews_request

Example #5

0

Show file

    def parse_product(self, response):
        """Build the product item for a page from markup plus breadcrumbs.

        Picture, name and manufacturer come from XPath; the name falls back
        to ``self.get_product_name``.  The category name is taken from the
        second entry of the ``BreadcrumbList`` JSON-LD when there is one.

        :param response: the product page response.
        :return: the populated product item.
        """
        field_xpaths = {
            'PicURL': '//a[@class="lb-show"]/@href',
            'ProductName':
                '//div[@class="productDataBlock"]/ul/li[1]/strong/text()',
            'ProductManufacturer':
                '//div[@class="productDataBlock item"]/'
                'ul/li[contains(text(), "Manufacturer")]/strong/span/text()',
        }
        product = self.init_item_by_xpaths(response, "product", field_xpaths)

        if not product.get('ProductName', ''):
            product['ProductName'] = self.get_product_name(response)

        # Fifth '/'-separated piece of str(response); the stripped '>'
        # presumably comes from the response's string form - TODO confirm.
        pieces = str(response).split("/")
        product['source_internal_id'] = pieces[4].rstrip('>')

        breadcrumbs_ld = extruct_helper.extract_json_ld(
            response.text, "BreadcrumbList")
        if breadcrumbs_ld:
            crumbs = breadcrumbs_ld.get('itemListElement', None)
            if crumbs and len(crumbs) > 1:
                second_crumb = crumbs[1].get('item', {})
                product['OriginalCategoryName'] = second_crumb.get('name', '')

        return product

Example #6

0

Show file

    def parse_product(self, response):
        """Yield product, its ``reevoo_internal_id`` and a review request.

        Nothing is emitted when the page carries no ``Product`` JSON-LD.
        The internal id is the last ``/``-separated segment of the URL path.

        :param response: product page response; ``response.meta['category']``
            must hold the category item.
        """
        category = response.meta['category']
        product = ProductItem()

        product_ld = extruct_helper.extract_json_ld(response.text, 'Product')
        if not product_ld:
            # TODO: add fallback plan?
            return

        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = category['category_path']
        product['ProductName'] = product_ld.get('name', '')
        product['PicURL'] = product_ld.get('image', '')

        # str.split always returns at least one element, so taking the
        # last piece is unconditional.
        url_path = urlparse(response.url).path
        product["source_internal_id"] = url_path.rsplit('/', 1)[-1]
        yield product

        yield self.product_id(product,
                              kind='reevoo_internal_id',
                              value=product['source_internal_id'])

        # TODO: test if the url is valid or not?
        review_request = Request(
            self.review_url_format.format(product["source_internal_id"]),
            callback=self.parse_review)
        review_request.meta['product'] = product
        yield review_request

Example #7

0

Show file

    def parse_category(self, response):
        """Walk a category listing and schedule the reviewed products.

        Yields the breadcrumb category (when JSON-LD provides one), one
        request per listed product that has a review-score element, and a
        request for the next listing page when a "next" link exists.

        :param response: the category listing response.
        """
        listed_products = response.xpath(
            "//div[@data-component='product-list-view']/article/div[@class='desc']")
        if not listed_products:
            return

        # This category may be too general, but it helps if we know it can
        # be skipped
        breadcrumbs_ld = extruct_helper.extract_json_ld(
            response.body, 'BreadcrumbList')
        if breadcrumbs_ld:
            category = extruct_helper.category_item_from_breadcrumbs_json_ld(
                breadcrumbs_ld)
            yield category
            if self.should_skip_category(category):
                return

        for entry in listed_products:
            # Only products showing a score element are followed.
            if entry.xpath(".//*[contains(@class, 'reevoo-score')]"):
                link = self.extract(entry.xpath("./a/@href"))
                yield Request(url=get_full_url(response, link),
                              callback=self.parse_product)

        next_link = self.extract(response.xpath("//a[@class='next']/@href"))
        if next_link:
            yield Request(get_full_url(response, next_link),
                          callback=self.parse_category)

Example #8

0

Show file

    def parse_reviews(self, response):
        """Complete the product and review carried in ``response.meta``.

        Fills verdict, pros and cons from the markup, merges the page's
        ``Review`` JSON-LD into the review when present, then yields the
        product followed by the review.

        :param response: review page response whose ``meta`` holds the
            ``product`` and ``review`` items.
        """
        product = response.meta['product']
        review = response.meta['review']

        product['TestUrl'] = response.url

        # The verdict paragraphs follow an <h4> heading, or failing that an
        # <h3> heading, mentioning "Wrap" or "Conclusion".
        verdict_xpaths = (
            '//h4[contains(text(),"Wrap") or contains(text(),"Conclusion")]/following-sibling::p//text()',
            '//h3[contains(text(),"Wrap") or contains(text(),"Conclusion")]/following-sibling::p//text()',
        )
        for verdict_xpath in verdict_xpaths:
            review['TestVerdict'] = self.extract_all(
                response.xpath(verdict_xpath), separator=" ")
            if review['TestVerdict']:
                break

        review['DBaseCategoryName'] = "PRO"
        review['TestUrl'] = response.url

        pros_nodes = response.xpath(
            "//div[contains(@class, 'review-pros')]//li/text()")
        review['TestPros'] = self.extract_all(pros_nodes, separator=' ; ')
        cons_nodes = response.xpath(
            "//div[contains(@class, 'review-cons')]//li/text()")
        review['TestCons'] = self.extract_all(cons_nodes, separator=' ; ')

        review_ld = extruct_helper.extract_json_ld(response.text, 'Review')
        if review_ld:
            review = extruct_helper.review_item_from_review_json_ld(
                review_ld, review)

        yield product
        yield review

Example #9

0

Show file

    def parse_review(self, response):
        """Build a review item from page markup and ``Review`` JSON-LD.

        Summary, pros, cons, internal id and product name are read with
        XPath; the JSON-LD helper then merges its own fields into the item.
        Missing XPath matches produce empty strings instead of raising, and
        literal ``'null'`` titles/summaries are replaced.

        :param response: the review page response.
        :return: the populated ``ReviewItem``.
        :raises ValueError: if the ``data-meta`` attribute is present but is
            not valid JSON.
        """
        review = ReviewItem()

        # Parsing using XPath
        xpaths = {
            'TestSummary': '//meta[@property="og:description"]/@content',
            'TestPros': '//*[@class="rs-review--positives"]//span/text()',
            'TestCons': '//*[@class="rs-review--negatives"]//span/text()',
            'source_internal_id':
            '//div[@data-widget="article-edit"]/@data-meta',
            'ProductName': '//section/header/h1/text()',
        }

        # Extract: every value is a (possibly empty) list of matches
        data = {}
        for key in xpaths:
            data[key] = response.xpath(xpaths[key]).extract()

        # Process: collapse each list into a scalar, tolerating no matches
        if data['source_internal_id']:
            data['source_internal_id'] = json.loads(
                data['source_internal_id'][0]).get('id')
        else:
            # Do not leak an empty list into the item.
            data['source_internal_id'] = ''
        data['TestPros'] = ';'.join(data['TestPros'])
        data['TestCons'] = ';'.join(data['TestCons'])
        data['TestSummary'] = (data['TestSummary'][0]
                               if data['TestSummary'] else '')
        data['ProductName'] = (data['ProductName'][0]
                               if data['ProductName'] else '')

        for key in xpaths:
            review[key] = data[key]

        # Parsing using JSON-LD
        # Populates:
        # Author, SourceTestRating, SourceTestScale, TestDateText, TestTitle

        review_json_ld = extruct_helper.extract_json_ld(
            response.text, 'Review')

        if review_json_ld:
            review = extruct_helper.review_item_from_review_json_ld(
                review_json_ld, review)

        review['TestUrl'] = response.url
        review['source_id'] = self.spider_conf["source_id"]
        review['DBaseCategoryName'] = 'PRO'

        # There are some occurrences of "null" in the TestTitle and
        # TestSummary.  .get() is used because TestTitle is only set when
        # the JSON-LD supplied it.
        if review.get('TestTitle') == 'null':
            review['TestTitle'] = review['ProductName']
        if review.get('TestSummary') == 'null':
            review['TestSummary'] = ''

        return review

Example #10

0

Show file

File: test_extruct_helper.py Project: ADJet1437/ScrapyProject

 def test_review_item_from_review_json_ld_default_best_rating(self):
     """A rating without ``bestRating`` yields a SourceTestScale of 5."""
     # Review markup that deliberately omits "bestRating".
     page = '''<script type="application/ld+json">
                    {
                        "@context":"http://schema.org/",
                        "@type":"Review",
                        "itemReviewed":{"@type":"Product","name":"OnePlus 5"},
                        "reviewRating":{"@type":"Rating","ratingValue":5}
                    }
                    </script>'''
     review_ld = extruct_helper.extract_json_ld(page, 'Review')
     item = extruct_helper.review_item_from_review_json_ld(review_ld)

     self.assertIsNotNone(item)
     scale = int(item['SourceTestScale'])
     self.assertEqual(scale, 5)

Example #11

0

Show file

File: tomshardware_de.py Project: ADJet1437/ScrapyProject

    def parse_review(self, response):
        """Yield the category, product and review found on a review page.

        Nothing is emitted when the page has no ``BreadcrumbList`` JSON-LD
        or when the category must be skipped.  When no product name can be
        derived from the ``og:url`` meta tag, only the category is emitted.

        :param response: the review page response.
        """
        # .format() belongs to the string; applying it to the result of
        # print() breaks as soon as print is a function.
        print('2. got to the parse_review page with {}'.format(response.url))
        review = ReviewItem()

        category_json_ld = extruct_helper.extract_json_ld(
            response.body, 'BreadcrumbList')
        if not category_json_ld:
            print('no category can be found')
            return
        category = extruct_helper.category_item_from_breadcrumbs_json_ld(
            category_json_ld)
        if self.should_skip_category(category):
            return
        yield category

        product_xpaths = {
            "PicURL": "//meta[@property='og:image']/@content",
            #"ProductName": "//meta[@property='og:title']/@content"
        }
        product = self.init_item_by_xpaths(response, "product", product_xpaths)
        if category:
            category_name = category['category_path']
            product["OriginalCategoryName"] = category_name.replace(
                ' | Testbericht', '')

        source_internal_id_re = r'Review_([0-9]+)'
        source_internal_id_xpath = "//article[contains(@id,'Review')]/@id"
        product["source_internal_id"] = response.xpath(
            source_internal_id_xpath).re_first(source_internal_id_re)

        product_name_re = r'.de/(.*),testberichte'
        product_name_xpath = "//meta[@property='og:url']/@content"
        product_name = response.xpath(product_name_xpath).re_first(
            product_name_re)
        if not product_name:
            # re_first() returns None without a match; bail out the same
            # way as for a missing category instead of raising
            # AttributeError on None.replace().
            print('no product name can be found')
            return
        product["ProductName"] = product_name.replace('-', ' ')
        yield product

        review_json_ld = extruct_helper.extract_json_ld(
            response.body, 'Article')
        if review_json_ld:
            review = extruct_helper.review_item_from_article_json_ld(
                review_json_ld, review)
        review["ProductName"] = product["ProductName"]
        review["DBaseCategoryName"] = "PRO"
        # NOTE(review): relies on init_item_by_xpaths() having set TestUrl
        # on the product - confirm.
        review["TestUrl"] = product["TestUrl"]
        review["source_internal_id"] = product["source_internal_id"]
        yield review

Example #12

0

Show file

File: bcc_nl.py Project: ADJet1437/ScrapyProject

    def parse_product_json(self, response):
        """Yield category, product, price and SKU items from Product JSON-LD.

        Nothing is emitted when the page has no ``Product`` JSON-LD or the
        JSON-LD has no ``category``.  The category is always emitted; the
        product and its id items only when the category is not skipped.

        :param response: the product page response.
        """
        product_ld = extruct_helper.extract_json_ld(response.body, 'Product')
        if not product_ld:
            return

        raw_category = product_ld.get('category', '')
        if not raw_category:
            return

        # Turn the '/'-separated category into a ' | '-separated path.
        category_path = ' | '.join(raw_category.split('/'))
        category = CategoryItem()
        category['category_path'] = category_path
        yield category

        if self.should_skip_category(category):
            return

        product = extruct_helper.product_item_from_product_json_ld(product_ld)
        product['source_id'] = self.spider_conf['source_id']
        product['TestUrl'] = response.url
        product['source_internal_id'] = product_ld.get('productID', '')
        product['OriginalCategoryName'] = category_path
        yield product

        # Product Price Item
        # ----------------------------------------
        offers = product_ld.get('offers', {})
        price_with_currency = (offers.get('price', '') + ' '
                               + offers.get('priceCurrency', ''))
        yield ProductIdItem.from_product(product,
                                         kind='price',
                                         value=price_with_currency)

        # Product SKU Item
        # ----------------------------------------
        yield ProductIdItem.from_product(product,
                                         kind='SKU',
                                         value=product_ld.get('sku', ''))

Example #13

0

Show file

File: test_extruct_helper.py Project: ADJet1437/ScrapyProject

 def test_review_item_from_review_json_ld_full_review(self):
     """A fully specified Review JSON-LD populates every mapped field."""
     html_text = '''<script type="application/ld+json">
                    {
                       "@context": "http://schema.org/",
                       "@type": "Review",
                       "itemReviewed": {
                         "@type": "Product",
                         "name": "OnePlus 5"
                       },
                       "author": {
                         "@type": "Person",
                         "name": "Joe"
                       },
                       "reviewRating": {
                         "@type": "Rating",
                         "ratingValue": "7",
                         "bestRating": "10"
                       },
                       "publisher": {
                         "@type": "Organization",
                         "name": "CNET"
                       },
                       "datePublished":"2017-08-07",
                       "headline":"OnePlus 5 review",
                       "description":"The OnePlus 5 is one of the best phones you can buy today"
                     }
                     </script>'''
     json_ld = extruct_helper.extract_json_ld(html_text, 'Review')
     review = extruct_helper.review_item_from_review_json_ld(json_ld)
     self.assertIsNotNone(review)
     self.assertEqual(review['ProductName'], 'OnePlus 5')
     self.assertEqual(review['Author'], 'Joe')
     # assertEqual rather than the deprecated assertEquals alias, which was
     # removed in Python 3.12.
     self.assertEqual(review['TestDateText'], '2017-08-07')
     self.assertEqual(int(review['SourceTestRating']), 7)
     self.assertEqual(int(review['SourceTestScale']), 10)
     self.assertEqual(review['TestTitle'], 'OnePlus 5 review')
     self.assertEqual(
         review['TestSummary'],
         'The OnePlus 5 is one of the best phones you can buy today')

Example #14

0

Show file

File: wired_com.py Project: ADJet1437/ScrapyProject

    def parse_review(self, response):
        """Yield the category, product and review for one review page.

        XPath data is merged with ``Review`` JSON-LD; the title comes from
        ``NewsArticle`` JSON-LD or, failing that, ``Product`` JSON-LD.
        Product name, date, author, picture and rating each fall back to a
        secondary source when the primary one is empty.  When the category
        must be skipped, only the category is emitted.

        :param response: the review page response.
        """
        # TODO verdict not found and source_id not found

        product_xpath = {"PicURL": "//*[@property='og:image']/@content"}
        review_xpaths = {
            "TestSummary": "//*[@property='og:description']/@content",
            "TestPros": "//div[@id='wired-tired']//p[1]/text()",
            "TestCons": "//div[@id='wired-tired']//p[2]/text()",
            "TestDateText": "(//meta[@itemprop='datePublished'])[1]/@content",
        }

        product = self.init_item_by_xpaths(response, "product", product_xpath)
        review = self.init_item_by_xpaths(response, "review", review_xpaths)

        # utilize structured data
        # --------------------------------------------------
        # get review from structured data 'Review'
        review_json_ld = extruct_helper.extract_json_ld(
            response.text, 'Review')
        if review_json_ld:
            review = extruct_helper.review_item_from_review_json_ld(
                review_json_ld, review)

        # get title from structure data 'NewsArticle', or Product
        # -------------------------------------------------------
        # wired.com use the format
        # 'Review: [product name] | wired' as title
        # most of the time
        title = ''
        news_article_json_ld = extruct_helper.extract_json_ld(
            response.text, 'NewsArticle')
        product_json_ld = extruct_helper.extract_json_ld(
            response.text, 'Product')

        # `or ''` keeps a missing/None headline or name from raising
        # AttributeError on .strip().
        if news_article_json_ld:
            title = (news_article_json_ld.get('headline') or '').strip()
            review['TestTitle'] = title
        elif product_json_ld:
            title = (product_json_ld.get('name') or '').strip()
            review['TestTitle'] = title

        # double check product name
        # --------------------------------------------------
        product_name = review.get('ProductName')
        if not product_name:
            if title.startswith('Review:'):
                PRODUCT_INDEX = 1
                product_name = title.split(':')[PRODUCT_INDEX].strip()
            else:
                product_name = title.split('Review')[0].strip()

            # get rid of the last part of 'product_name | wired'
            if '|' in product_name:
                product_name = product_name.split('|')[0].strip()

            review['ProductName'] = product_name

        product['ProductName'] = product_name

        # double check date
        # --------------------------------------------------
        # .get() for consistency with the other lookups in this method; a
        # direct index raises KeyError when the field was never set.
        date = review.get('TestDateText', '')
        if not date:
            date_xpath = "//meta[@name='parsely-pub-date']/@content"
            date = self.extract(response.xpath(date_xpath))
        review['TestDateText'] = date_format(date, '')

        # double check author
        # --------------------------------------------------
        author = review.get('Author', '')
        if not author:
            author_xpath = "//span[@itemprop='author']/a/text()"
            author = self.extract(response.xpath(author_xpath))
            if author:
                review['Author'] = author

        # parse category using tags
        category = self.get_categories_from_tags(response)
        if category:
            yield category
            if self.should_skip_category(category):
                return
            product['OriginalCategoryName'] = category['category_path']

        # double check PicURL for product
        # --------------------------------------------------
        pic_url = product.get('PicURL')
        if not pic_url:
            pic_url_xpath = "(//div[contains(@class, 'gallery-pic')]//img)[1]/@src"
            pic_url = self.extract_xpath(response, pic_url_xpath)
            if pic_url:
                product['PicURL'] = pic_url

        yield product

        # double check review rating
        # --------------------------------------------------
        rating_value = review.get('SourceTestRating')
        if not rating_value:
            rating_text_xpath = "//h3[contains(text(), 'RATING')]/following-sibling::p//text()"
            rating_text = self.extract_xpath(response, rating_text_xpath)
            rating_re = r'([0-9]+)'
            if rating_text:
                rating_match = re.search(rating_re, rating_text)
                if rating_match:
                    rating = rating_match.group(0)
                    review['SourceTestRating'] = rating
                    # u'' literal: same unicode value as unicode('10') on
                    # Python 2, and also valid on Python 3.3+.
                    REVIEW_SCALE = u'10'
                    review['SourceTestScale'] = REVIEW_SCALE

        review["DBaseCategoryName"] = "PRO"
        yield review

Example #15

0

Show file

    def parse_review(self, response):
        """Yield category, product id, product and review for a review page.

        The product arrives in ``response.meta['product']``.  Review fields
        come from XPath merged with ``Review`` JSON-LD; the verdict has two
        fallback XPaths, and an award image marks an "Editor's Choice".
        When the category must be skipped, only the category is emitted.

        :param response: review page response whose ``meta`` holds the
            ``product`` item.
        """
        review_xpaths = {
            "TestTitle": "//*[@property='og:title']/@content",
            "TestSummary": "//*[@property='og:description']/@content",
            "TestVerdict": "//section[@class='review-body']//*[contains(text(),'Conclusion')]/ancestor::p//text()",
            "TestPros": "//div[@class='pros-cons-bl']//*[contains(text(),'Pros')]//parent::li//p[@class='summary']//text()",
            "TestCons": "//div[@class='pros-cons-bl']//*[contains(text(),'Cons')]//parent::li//p[@class='summary']//text()",
        }

        product = response.meta['product']
        review = self.init_item_by_xpaths(response, "review", review_xpaths)

        # get category
        category = CategoryItem()
        breadcrumbs_ld = extruct_helper.extract_json_ld(
            response.text, 'BreadcrumbList')
        if breadcrumbs_ld:
            category = extruct_helper.leaf_category_item_from_breadcrumbs_json_ld(
                breadcrumbs_ld, category)
            yield category
            if self.should_skip_category(category):
                return
            product["OriginalCategoryName"] = category['category_path']

        product['TestUrl'] = response.url
        review['TestUrl'] = product['TestUrl']

        review_ld = extruct_helper.extract_json_ld(response.text, 'Review')
        if review_ld:
            review = extruct_helper.review_item_from_review_json_ld(
                review_ld, _review=review)
            product['ProductName'] = review['ProductName']
        else:
            heading_nodes = response.xpath(
                "//h1[contains(@class,'item')]/text()")
            product['ProductName'] = self.extract(heading_nodes)
            review['ProductName'] = product['ProductName']

        if review.get("TestDateText", ''):
            review["TestDateText"] = date_format(review["TestDateText"],
                                                 "%Y-%m-%dT%H:%M:%S")

        # Try each fallback in turn while the verdict is still empty.
        verdict_fallbacks = (
            "string(//section[@class='review-body']//*[contains(text(),'Conclusion')]/following::p[ string-length(.//text()) > 0 ][1])",
            "string(//div[contains(@class, 'article-footer')]/preceding::p[ string-length(.//text()) > 0 ][1])",
        )
        for fallback_xpath in verdict_fallbacks:
            if review['TestVerdict']:
                break
            review['TestVerdict'] = self.extract_all(
                response.xpath(fallback_xpath))

        internal_id = self.extract(
            response.xpath("//meta[@name='article-id']/@content"))
        if internal_id:
            product['source_internal_id'] = internal_id
            review['source_internal_id'] = internal_id
            yield self.product_id(
                product, kind='pcmag_internal_id', value=internal_id)

        award_image = self.extract(
            response.xpath("//div[@class='editors-logo']/img/@src"))
        if award_image:
            review['AwardPic'] = get_full_url(response, award_image)
            review['award'] = "Editor's Choice"

        review["DBaseCategoryName"] = "PRO"
        review["SourceTestScale"] = "5"

        yield product
        yield review

Example #16

0

Show file

File: techgearlab_com.py Project: ADJet1437/ScrapyProject

    def parse_review(self, response):
        """Yield the category, product and review for one review page.

        XPath data is merged with ``Review`` JSON-LD.  The product name is
        the part of the reviewed item's name (or, failing that, of the
        page's <h1>) before the word 'review'; the internal id is the last
        ``/``-separated piece of the URL.  The category is only emitted
        when a category path was found.

        :param response: the review page response.
        """
        product = self.init_item_by_xpaths(
            response, "product",
            {"PicURL": "//*[@property='og:image']/@content"})
        review = self.init_item_by_xpaths(response, "review", {
            "TestTitle": "//*[@property='og:title']/@content",
            "TestSummary": '//meta[@property="og:description"]/@content',
            "TestVerdict": '//a[@id="conclusion"]/following::p[1]/text()',
            "TestPros": '//div[@class="iconProText"]/text()',
            "TestCons": '//div[@class="iconConText"]/text()',
            'Author': '//div[@class="small"]/a/text()',
        })

        # utilize structured data
        product_name = ''
        review_ld = extruct_helper.extract_json_ld(response.text, 'Review')
        if review_ld:
            review = extruct_helper.review_item_from_review_json_ld(
                review_ld, review)
            reviewed_item = review_ld.get('itemReviewed', {})
            product_name = reviewed_item.get('name', '')
            product_name = product_name.split('review')[0].strip()

        # incremental
        if not review.get('TestDateText', ''):
            # No date so far: parse the text of the second "small" div.
            raw_date = self.extract(
                response.xpath('//div[@class="small"][2]/text()'))
            review['TestDateText'] = parse(raw_date).strftime("%Y-%m-%d")
        else:
            review['TestDateText'] = date_format(review['TestDateText'], '')

        if not product_name:
            heading = self.extract(response.xpath("//h1/text()"))
            if heading:
                product_name = heading.split('review')[0].strip()

        product['ProductName'] = product_name
        review['ProductName'] = product['ProductName']

        category_nodes = response.xpath("//span[@itemprop='name']/text()")
        product['OriginalCategoryName'] = self.extract_all(
            category_nodes, separator=' | ')

        page_id = response.url.split('/')[-1]
        product['source_internal_id'] = page_id
        review['source_internal_id'] = page_id

        if product.get('OriginalCategoryName', ''):
            category = CategoryItem()
            category['category_path'] = product['OriginalCategoryName']
            yield category

        yield product

        award_node = response.xpath("//td/div[2]/img[contains(@alt, 'Award')]")
        if award_node:
            award_name = self.extract_xpath(award_node, './@alt')
            award_image_url = self.extract_xpath(award_node, './@src')
            # Both the name and the image are required to record an award.
            if award_name and award_image_url:
                review['award'] = 'TechGearLab ' + award_name
                review['AwardPic'] = award_image_url

        review["DBaseCategoryName"] = "PRO"

        yield review

Example #17

0

Show file

File: hardware_info_nl.py Project: ADJet1437/ScrapyProject

    def parse_review(self, response):
        """Parse a review page using the microdata embedded in the HTML.

        Yields the page's CategoryItem (unless the category is skipped), then
        extracts a product and a single PRO review through ``extruct_helper``
        and prints them.

        NOTE(review): the method currently stops at a bare ``return`` right
        after printing the extracted items, so neither the product nor the
        review is ever yielded and everything below that ``return`` is
        unreachable. This looks like work-in-progress debugging state --
        confirm the intended behaviour before relying on this callback.

        :param response: the downloaded review page.
        """
        category_path_xpath = "(//div[@class='popular_groups']//li/a)[1]/text()"
        category = CategoryItem()
        category['category_path'] = self.extract(
            response.xpath(category_path_xpath))

        if self.should_skip_category(category):
            return

        yield category

        microdata_items = extruct_helper.get_microdata_extruct_items(
            response.text)
        if not microdata_items:
            return

        # The id is the path segment right after "/review/". The pattern needs
        # a capture group: calling match.group(1) on a pattern without one
        # raises "IndexError: no such group".
        source_internal_id_re = r'/review/([^/]+)/'
        source_internal_id = ''
        match = re.search(source_internal_id_re, response.url)
        if match:
            source_internal_id = match.group(1)

        product = ProductItem.from_response(
            response, category, source_internal_id=source_internal_id)
        review = list(
            extruct_helper.get_reviews_microdata_extruct(microdata_items,
                                                         product,
                                                         review_type='PRO'))

        if len(review) > 1:
            self.logger.error(
                'Found more than 1 reviews in {0} through microdata'.format(
                    response.url))
            return

        # Microdata may be present without containing any review; indexing an
        # empty list below would raise IndexError.
        if not review:
            self.logger.info(
                'Found no review in {0} through microdata'.format(
                    response.url))
            return

        review = review[0]
        # Debug output; the parenthesised single-argument form prints the same
        # thing under Python 2 and is also valid Python 3.
        print(product)
        print(review)

        return

        # NOTE(review): unreachable -- everything below follows the bare
        # ``return`` above. It is kept verbatim; decide whether to delete it
        # or to drop the debugging ``return``.
        product_xpaths = {
            "ProductName": "//h1[@itemprop='headline']/text()",
            "PicURL": "//meta[@property='og:image']/@content",
        }
        product = self.init_item_by_xpaths(response, "product", product_xpaths)
        product['TestUrl'] = response.url
        picurl = product.get("PicURL", "")
        if picurl and picurl[:2] == "//":
            product["PicURL"] = "https:" + product["PicURL"]
        if picurl and picurl[:1] == "/":
            product["PicURL"] = get_full_url(response.url, picurl)

        product['OriginalCategoryName'] = category['category_path']

        review_xpaths = {
            "TestTitle":
            "//*[@property='og:title']/@content",
            "TestPros":
            "//div[div[text()='The good'] or span[text()='The good']]//li/text()",
            "TestCons":
            "//div[div[text()='The bad'] or span[text()='The bad']]//li/text()",
            "TestSummary":
            "//h3[.//text() = 'Bottom Line' or .//text() = 'Bottom line' or .//text = 'Verdict']/"
            "following-sibling::*[ .//text()[normalize-space()] ][1]//text()",
            "TestVerdict":
            "//div[div[text()='Verdict'] or span[text()='Verdict']]//p/text()",
        }
        review = self.init_item_by_xpaths(response, "review", review_xpaths)
        review['TestUrl'] = response.url

        review_json_ld = extruct_helper.extract_json_ld(
            response.text, 'Review')
        if review_json_ld:
            review = extruct_helper.review_item_from_review_json_ld(
                review_json_ld, review)

        if not review.get('TestDateText'):
            review['TestDateText'] = self.extract(
                response.xpath("//meta[@itemprop='datePublished']/@content"))

        if review["TestDateText"]:
            review["TestDateText"] = review["TestDateText"].strip()
            review["TestDateText"] = date_format(review["TestDateText"],
                                                 "%Y-%m-%d")

        if review.get('ProductName', ''):
            product['ProductName'] = review['ProductName']
        else:
            title = review["TestTitle"].lower()
            if ":" in title:
                all_title_parts = title.split(":")
                for part in all_title_parts:
                    review["ProductName"] = part.replace(
                        "review", "") if 'review' in part else title.replace(
                            "review", "")
            else:
                review["ProductName"] = title.replace("review", "")
                review["ProductName"] = review["ProductName"].strip("-: ")
                product["ProductName"] = review["ProductName"]

        internal_id_re = r',review-(.*)\.html'
        match = re.search(internal_id_re, response.url)
        if match:
            internal_id = match.group(1)
            product['source_internal_id'] = internal_id
            review['source_internal_id'] = internal_id

            product_id = self.product_id(product,
                                         kind='tomsguide_en_internal_id',
                                         value=internal_id)
            yield product_id

        alt_summary_xpath = "//div[@class='sbbl-content-text']/p//text()"
        if not review['TestSummary']:
            review['TestSummary'] = self.extract(
                response.xpath(alt_summary_xpath))

        alt_verdict_xpath = "//div[div[text()='Verdict'] or span[text()='Verdict']]//div/text()"
        if not review['TestVerdict']:
            review['TestVerdict'] = self.extract(
                response.xpath(alt_verdict_xpath))

        # only get summary from article description if both verdict and summary are empty,
        # or else summary and verdict may end up to be the same
        if not review['TestSummary'] and not review['TestVerdict']:
            review['TestSummary'] = self.extract(
                response.xpath("//meta[@name='description']/@content"))

        review["DBaseCategoryName"] = "PRO"

        ec_award_xpath = "//section[contains(@class, 'page-content-leftcol')]//div[@class='editor-pick']"
        ec_award = response.xpath(ec_award_xpath)
        if ec_award:
            review['award'] = "Editor's Choice"
            review[
                'AwardPic'] = "http://qa901.office.alatest.se/omt-award-images/tomsguide_en_editor_pick.png"

        yield product
        yield review

Example #18

0

Show file

File: feelunique_com.py Project: ADJet1437/ScrapyProject

    def parse_category(self, response):
        """Parse a category listing page and schedule review requests.

        On the first page of a category (no ``category`` in ``response.meta``)
        the category is built from the BreadcrumbList JSON-LD and yielded; the
        request is retried when that JSON-LD is missing. For every listed
        product that shows a rating, a ProductIdItem and a review Request
        (callback ``parse_reviews``) are yielded. Finally the next listing
        page, if any, is requested with the category passed along in ``meta``.

        :param response: the downloaded category listing page.
        """
        products_xpath = "//div[@class='Productlist']/div"
        product_sku_xpath = "./@data-sku"
        has_review_xpath = ".//span[@class='Rating-average']"
        next_page_xpath = "(//*[@rel='next'])[1]/@href"

        category = response.meta.get('category', '')
        if not category:
            # the category we get here is actually parent category
            category_json_ld = extruct_helper.extract_json_ld(
                response.text, 'BreadcrumbList')
            if not category_json_ld:
                request = self._retry(response.request)
                yield request
                return

            category = extruct_helper.category_item_from_breadcrumbs_json_ld(
                category_json_ld)
            current_category_name = self.extract(
                response.xpath(
                    "//div[@id='breadcrumb']/ul/li[@class='pad-left']/text()"))
            # Append the current page's own name when the breadcrumb JSON-LD
            # stops at the parent category.
            if current_category_name.lower(
            ) != category['category_leaf'].lower():
                category['category_leaf'] = current_category_name
                category['category_path'] = u'{} | {}'.format(
                    category['category_path'], current_category_name)
                category['category_url'] = response.url

            yield category

            if self.should_skip_category(category):
                return

        products = response.xpath(products_xpath)

        # Not a leaf category page
        if not products:
            return

        # We skip the product page, as feelunique.com tries to block us if we access too many of their pages,
        # but it is impossible for them to block the access to Bazaarvoice API
        for product in products:
            has_review = product.xpath(has_review_xpath)
            if not has_review:
                continue

            product_sku = self.extract(product.xpath(product_sku_xpath))
            if product_sku:
                product_id = ProductIdItem()
                product_id['source_internal_id'] = product_sku
                product_id['ID_kind'] = 'feelunique_internal_id'
                product_id['ID_value'] = product_sku
                yield product_id

                bv_params = self.bv_base_params.copy()
                bv_params['bv_id'] = product_sku
                bv_params['offset'] = 0
                review_url = self.get_review_url(**bv_params)

                last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
                    self.mysql_manager, self.spider_conf['source_id'],
                    product_sku)

                request = Request(review_url, callback=self.parse_reviews)
                request.meta['last_user_review'] = last_user_review
                request.meta['filter_other_sources'] = False
                request.meta['OriginalCategoryName'] = category[
                    'category_path']
                request.meta['bv_id'] = product_sku
                yield request
            else:
                # The XPath is relative, so it must be evaluated against the
                # current product node; evaluating it against ``response``
                # never finds this product's link.
                product_url_xpath = "./a/@href"
                product_url = self.extract(product.xpath(product_url_xpath))
                product_url = get_full_url(response, product_url)
                self.logger.info("Failed to get SKU for product at %s",
                                 product_url)

        next_page_url = self.extract_xpath(response, next_page_xpath)
        if next_page_url:
            next_page_url = get_full_url(response, next_page_url)
            next_page_request = Request(next_page_url,
                                        callback=self.parse_category)
            next_page_request.meta['category'] = category
            yield next_page_request

Example #19

0

Show file

File: expertreviews_co_uk.py Project: ADJet1437/ScrapyProject

    def parse_review(self, response):
        """Scrape the product and review from a single review page.

        Yields, in order: a CategoryItem (only when a category name was
        extracted), a ProductIdItem (only when a numeric internal id was
        found), the product item, and finally either the review item itself
        or a Request for the last pagination page (callback
        ``get_test_verdict``) that carries the review in ``meta``.

        :param response: the downloaded review page.
        """
        product = self.init_item_by_xpaths(response, 'product', {
            'PicURL': '//meta[@property="og:image"]/@content',
            'OriginalCategoryName': '//div[@class="dennis-kicker"]/a/text()'
        })
        review = self.init_item_by_xpaths(response, 'review', {
            'TestSummary':
            '//meta[@property="og:description"]/@content',
            'TestPros':
            '//div[contains(@class, "field-name-field-pros")]'
            '/div[@class="field-items"]//text()',
            'TestCons':
            '//div[contains(@class, "field-name-field-cons")]'
            '/div[@class="field-items"]//text()',
            'TestDateText':
            '//span[@class="date-display-single"]/text()',
            'Author':
            '//span[@class="field field-name-field-author '
            'field-type-node-reference field-label-hidden"]/'
            'span[@class="field-item even"]/text() | //div[@class="field '
            'field-name-author-names-combined field-type-text '
            'field-label-hidden"]/div[@class="field-items"]/div'
            '[@class="field-item even"]/a'
        })

        page_title = self.extract(
            response.xpath('//meta[@property="og:title"]/@content'))

        # Prefer the structured JSON-LD data when the page provides it.
        name = ''
        json_ld = extruct_helper.extract_json_ld(response.text, 'Review')
        if json_ld:
            review = extruct_helper.review_item_from_review_json_ld(
                json_ld, review)
            reviewed = json_ld.get('itemReviewed', {})
            name = reviewed.get('name', '').split('review')[0].strip()

        # Normalise the review date when one was found.
        raw_date = review.get('TestDateText', '')
        if raw_date:
            review['TestDateText'] = date_format(raw_date, '')

        # Fall back to the page title when JSON-LD gave no product name.
        name = name or page_title.split('review')[0].strip()

        product['ProductName'] = name
        review['ProductName'] = product['ProductName']
        review['TestTitle'] = page_title

        category_name = product.get('OriginalCategoryName', '')
        if category_name:
            category = CategoryItem()
            category['category_leaf'] = category_name
            category['category_path'] = category_name
            category_href = self.extract(response.xpath(
                '//div[contains(@class, '
                '"field-category-primary")]//a/@href'))
            category['category_url'] = get_full_url(response, category_href)
            yield category

        # Award extraction is currently disabled:
        # award_xpath = '//div[contains(@class, "group-media")]//div[contains
        # (@class, "field-name-field-award-image")]//img/@src'
        # award = response.xpath(award_xpath)
        # if award:
        #     award_re = r'(.*)\s+Logo'
        #     award_name = award.xpath('./@title').re_first(award_re)
        #     award_image_url = self.extract_xpath(award, './@src')
        #     if award_name and award_image_url:
        #         review['award'] = award_name
        #         review['AwardPic'] = award_image_url

        # Resolve the internal id: first from the og:url meta tag, then from
        # the response URL when that did not produce a numeric value.
        internal_id = ''
        og_url = self.extract_xpath(
            response, '//meta[@property="og:url"]/@content')
        if og_url:
            id_match = re.search(r'go/([0-9]+)', og_url)
            if id_match:
                internal_id = id_match.group(1)
            else:
                internal_id = og_url.split('/')[-2]

        if not (internal_id and internal_id.isdigit()):
            internal_id = response.url.split('/')[-2]

        if internal_id and internal_id.isdigit():
            id_item = ProductIdItem()
            id_item['ProductName'] = product['ProductName']
            id_item['source_internal_id'] = internal_id
            id_item['ID_kind'] = 'expertreviews_internal_id'
            id_item['ID_value'] = internal_id
            yield id_item

        product['source_internal_id'] = internal_id
        yield product

        review['DBaseCategoryName'] = 'PRO'
        review['SourceTestScale'] = '5'
        review['source_internal_id'] = product['source_internal_id']

        last_page_href = self.extract(response.xpath(
            '//section[@class="pagination mn_background"]'
            '//li[last()]/a/@href'))
        if not last_page_href:
            # No pagination link: take the verdict from the last qualifying
            # paragraph of the article body on this page.
            verdict_nodes = response.xpath(
                '(//div[contains(@class, "field-name-body")]'
                '//p[ not(strong) and .//text()[normalize-space()]and'
                ' .//text()[not(starts-with(., "Buy"))]and '
                './/text()[not(starts-with(., "BUY"))] ])[last()]//text()')
            review["TestVerdict"] = self.extract_all(
                verdict_nodes, separator='', keep_whitespace=True)
            yield review
            return

        # Otherwise hand the review over to the callback for the last page.
        follow_up = Request(get_full_url(response, last_page_href),
                            callback=self.get_test_verdict)
        follow_up.meta['review'] = review
        yield follow_up

Example #20

0

Show file

    def parse_review(self, response):
        """Build and yield the product and review items for a review page.

        The category is read from ``response.meta['category']``. Fields are
        first filled from XPath expressions and then merged with the page's
        ``Review`` JSON-LD when it exists. When no summary, verdict, pros or
        cons were found, the first and last paragraphs of the intro section
        are used as summary and verdict.

        :param response: the downloaded review page.
        """
        category = response.meta['category']

        product = self.init_item_by_xpaths(response, "product", {
            "source_internal_id":
            u"//div[@class='article_content']/descendant-or-self::*[./@data-product-id][1]/@data-product-id",
            "ProductName": u"normalize-space(//h1)",
            "PicURL": u"//meta[@property='og:image']/@content",
        })
        product['ProductName'] = remove_suffix(product['ProductName'],
                                               ' Review')
        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = category['category_path']

        # Make the picture URL absolute when one was found.
        if product.get('PicURL', ''):
            product['PicURL'] = get_full_url(response, product['PicURL'])

        review = self.init_item_by_xpaths(response, "review", {
            "TestDateText": u"//*[@itemprop='datePublished']/@content",
            "TestPros": u"//p[.//*[contains(text(),'Pros')]]/text()",
            "TestCons": u"//p[.//*[contains(text(),'Cons')]]/text()",
            "TestSummary": u"string(//h2[text()='Summary']/following::p)",
            "TestVerdict": u"//p[.//*[contains(text(),'Verdict')]]/text()",
            "TestTitle": u"normalize-space(//h1)",
            "award": u"(//a[contains(@alt, 'Award')])[1]/@alt",
            "AwardPic": u"(//a[contains(@alt, 'Award')])[1]/@data-bgset"
        })
        review['TestUrl'] = response.url

        json_ld = extruct_helper.extract_json_ld(response.text, 'Review')
        if json_ld:
            review = extruct_helper.review_item_from_review_json_ld(
                json_ld, review)

        # Keep the product name in sync; the review's own name wins.
        if not review.get('ProductName'):
            review['ProductName'] = product['ProductName']
        else:
            product['ProductName'] = review['ProductName']

        award_pic = review.get("AwardPic", "")
        if award_pic:
            review["AwardPic"] = get_full_url(response, award_pic)

        # Not a detailed review, can only get summary and verdict
        detail_fields = ('TestSummary', 'TestVerdict', 'TestPros', 'TestCons')
        if not any(review[field] for field in detail_fields):
            review['TestSummary'] = self.extract(
                response.xpath("string(//section[@id='Intro']/p[1])"))
            review['TestVerdict'] = self.extract(
                response.xpath("string(//section[@id='Intro']/p[last()])"))

        review["DBaseCategoryName"] = "PRO"
        review["SourceTestScale"] = "10"

        yield product
        yield review