コード例 #1
0
    def parse_review(self, response):
        """Build a review item for the page in ``response``.

        The item is seeded from the page's JSON-LD via ``extruct_helper``:
        a "Review" entry is preferred, then a "NewsArticle" entry, and an
        empty ``ReviewItem`` is used when neither is found.  Page-specific
        fields are then filled in from the HTML and the URL.

        :param response: the response for the review page.
        :return: the populated review item.
        """
        review_json_ld = extruct_helper.extract_json_ld(
            response.text, "Review")
        article_json_ld = extruct_helper.extract_json_ld(
            response.text, "NewsArticle")

        if review_json_ld:
            review = extruct_helper.review_item_from_review_json_ld(
                review_json_ld)
        elif article_json_ld:
            review = extruct_helper.review_item_from_article_json_ld(
                article_json_ld)
        else:
            review = ReviewItem()

        review['DBaseCategoryName'] = 'PRO'
        if not review.get('TestUrl', ''):
            review['TestUrl'] = response.url

        # The product data block always overwrites any JSON-LD product name;
        # get_product_name() is only the fallback when that block is empty.
        review['ProductName'] = self.extract(
            response.xpath(
                "//div[@class='productDataBlock']/ul/li[1]/strong/text()"))
        if not review.get('ProductName', ''):
            review['ProductName'] = self.get_product_name(response)

        # Take the id from the URL itself rather than from str(response):
        # the "<status url>" text is a display format, and relying on it
        # forced a trailing '>' to be stripped afterwards.  The segment index
        # is unchanged because the "<status " prefix contains no slash.
        url_parts = response.url.split("/")
        if len(url_parts) > 4:
            review['source_internal_id'] = url_parts[4]
        # NOTE(review): shorter URLs used to raise IndexError here; the id is
        # now simply left unset for them -- confirm that is acceptable.

        review['TestPros'] = self.extract(
            response.xpath("//div[@id='ahReviewPros']/ul/li/text()"))
        review['TestCons'] = self.extract(
            response.xpath("//div[@id='ahReviewCons']/ul/li/text()"))

        return review
コード例 #2
0
    def parse_review(self, node, response):
        """Translate one ``node`` mapping into a new ``ReviewItem``.

        Top-level keys of ``node`` and the keys of its ``'meta'`` sub-mapping
        are copied onto the item; ``response`` is accepted but not used.

        :param node: mapping describing one review (``.get`` access is used).
        :param response: unused.
        :return: the populated review item.
        """
        review = ReviewItem()
        meta = node.get('meta', {})

        # Fields that are always copied, in the order they are stored.
        for field, value in (
                ('ProductName', node.get('title', '')),
                ('source_internal_id', meta.get('id', '')),
                ('TestDateText', meta.get('review_date', '')),
                ('TestSummary', node.get('description', '')),
        ):
            review[field] = value

        # The review title mirrors the product name stored just above.
        review['TestTitle'] = review.get('ProductName')
        review['TestUrl'] = node.get('url', '')
        review['SourceTestRating'] = meta.get('expert_evaluation_float', '')

        # A scale of 10 is only recorded when a rating was actually found.
        if review.get('SourceTestRating'):
            review['SourceTestScale'] = 10

        review['source_id'] = self.spider_conf['source_id']
        review['DBaseCategoryName'] = 'PRO'

        # Optional fields: stored only when the meta mapping has a non-empty
        # value (the source page itself carries no author).
        for field, meta_key in (('TestVerdict', 'conclusion'),
                                ('Author', 'reviewer')):
            value = meta.get(meta_key, '')
            if value:
                review[field] = value

        return review
コード例 #3
0
    def level_4(self, response):
        """Scrape one product page: category, product, paging and reviews.

        Yields, in this order:

        * the ``CategoryItem`` for the page -- unless
          ``should_skip_category`` rejects it, in which case nothing at all
          is yielded;
        * the product item built by ``init_item_by_xpaths``;
        * a ``Request`` for the next page when a "next" paging link exists,
          handled by this same callback;
        * one ``ReviewItem`` per review container found on the page.

        :param response: the response for the page being scraped.
        """
        original_url = response.url

        breadcrumb_xpath = "//ol[@class='breadcrumbs']//span//text()"
        title_href_xpath = "//div[@class='overviewHeaderTitle']//h1/a/@href"
        title_text_xpath = "//div[@class='overviewHeaderTitle']//h1/a//text()"
        # A run of digits enclosed by slashes inside the title link's href.
        # Raw string: "\d" in a plain literal is an invalid escape sequence.
        id_pattern = re.compile(r"(?<=/)\d+(?=/)")

        def absolutize(link):
            """Return ``link`` with a scheme/host when it starts with '/'."""
            if link.startswith("//"):
                # Protocol-relative URL: only the scheme is missing.
                return "https:" + link
            if link.startswith("/"):
                return get_full_url(original_url, link)
            return link

        category = CategoryItem()
        category['category_url'] = original_url
        category['category_leaf'] = self.extract(response.xpath(
            "//ol[@class='breadcrumbs']//ol/li[last()]//a//span//text()"))
        category['category_path'] = self.extract_all(
            response.xpath(breadcrumb_xpath), ' | ')
        if self.should_skip_category(category):
            return
        yield category

        product_xpaths = {
            "source_internal_id": title_href_xpath,
            "ProductName": title_text_xpath,
            "OriginalCategoryName": breadcrumb_xpath,
            "PicURL": "//div[@class='headerContent']//img/@src",
        }
        product = self.init_item_by_xpaths(response, "product", product_xpaths)
        product['TestUrl'] = original_url

        pic_url = product.get("PicURL", "")
        if pic_url:
            # Previously the "//" and "/" checks were two independent ifs, so
            # the "https:" fix for "//" URLs was always overwritten.
            product["PicURL"] = absolutize(pic_url)

        # Make sure the manufacturer key is present even when nothing was
        # extracted for it.
        if product.get("ProductManufacturer", "") == "":
            product["ProductManufacturer"] = ""

        try:
            product["OriginalCategoryName"] = category['category_path']
        except KeyError:
            # Best effort: keep the value extracted through product_xpaths.
            pass

        id_match = id_pattern.search(product.get("source_internal_id", "") or "")
        if id_match:
            product["source_internal_id"] = id_match.group(0)

        yield product

        next_href = self.extract(response.xpath(
            "//div[contains(@class,'paging-footer')]//a[contains(@class,'next')]/@href"))
        if next_href:
            yield Request(get_full_url(original_url, next_href),
                          callback=self.level_4)

        # These XPaths are absolute, so they select the same page-level nodes
        # for every review container; evaluate them once.  They only survive
        # on a review when the product item lacks the corresponding values.
        page_level_id = self.extract(response.xpath(title_href_xpath))
        page_level_name = self.extract(response.xpath(title_text_xpath))

        # Fields read relative to each review container, in storage order.
        review_field_xpaths = (
            ("SourceTestRating",
             ".//div[@class='reviewAverageRating']//meter/@value"),
            ("TestDateText", ".//span[@class='writeDate']//time//text()"),
            ("TestPros", ".//div[@class='pros']//ul/li//text()"),
            ("TestCons", ".//div[@class='cons']//ul/li//text()"),
            ("TestSummary",
             ".//div[contains(@class,'reviewText')]//p[count(br)=0]/text() | .//div[contains(@class,'reviewText')]//br[position()=1]/preceding-sibling::text()[1]"),
            ("Author", ".//div[@class='reviewWriter']/strong//text()"),
            ("TestTitle", ".//div[@class='reviewContent']/h3/a//text()"),
        )

        containers = response.xpath(
            "//ul[@class='reviewList']/li[@class='review']")
        for review_container in containers:
            review = ReviewItem()
            review['source_internal_id'] = page_level_id
            review['ProductName'] = page_level_name
            for field, xpath in review_field_xpaths:
                review[field] = self.extract(review_container.xpath(xpath))
            review['TestUrl'] = original_url

            try:
                # Prefer the already-normalised values from the product item.
                review['ProductName'] = product['ProductName']
                review['source_internal_id'] = product['source_internal_id']
            except KeyError:
                # The product item lacks a value: keep the page-level ones.
                pass

            # NOTE(review): nothing in this method sets AwardPic, so this
            # only matters if ReviewItem provides a value itself -- confirm.
            award_pic = review.get("AwardPic", "")
            if award_pic:
                review["AwardPic"] = absolutize(award_pic)

            id_match = id_pattern.search(
                review.get("source_internal_id", "") or "")
            if id_match:
                review["source_internal_id"] = id_match.group(0)

            review["SourceTestScale"] = "5"

            if review["TestDateText"]:
                # presumably "nl" selects Dutch month names -- verify against
                # date_format.
                review["TestDateText"] = date_format(
                    review["TestDateText"], "%d %B %Y", ["nl"])

            review["DBaseCategoryName"] = "USER"

            yield review