Example #1
    def parse_product(self, response):
        review_url_xpath = "//div[@class='reviews']/p[@class='links ftr']/a/@href"

        _product = extruct_helper.product_items_from_microdata(
            response, response.meta['category'])
        if not _product:
            request = self._retry(response.request)
            yield request
            return

        product = _product.get('product')
        yield product

        product_ids = _product.get('product_ids')
        for product_id in product_ids:
            yield product_id

        review_url = self.extract(response.xpath(review_url_xpath))
        if review_url:
            review_url = get_full_url(response, review_url)
            last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
                self.mysql_manager, self.spider_conf['source_id'],
                product["source_internal_id"])

            request = Request(url=review_url, callback=self.parse_review)
            request.meta['product'] = product
            request.meta['last_user_review'] = last_user_review
            yield request
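
The `last_user_review` date stashed in `request.meta` above feeds the incremental guard that the review callbacks in the later examples apply: each parsed review date is compared against the newest date already stored, and pagination stops once it crosses it. A minimal sketch of that guard, assuming `last_user_review` is a datetime returned by the DB helper and review dates arrive as ISO-formatted text (both assumptions, mirroring the later snippets rather than code shown here):

    import dateparser

    def is_older_than_last_scraped(date_text, last_user_review):
        # True when the review predates what is already stored, i.e. the
        # spider can stop following further (older) review pages.
        current = dateparser.parse(date_text, date_formats=['%Y-%m-%d'])
        return bool(current and last_user_review and current < last_user_review)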
Example #2
    def parse_product(self, response):
        items = extruct_helper.get_microdata_extruct_items(response.body_as_unicode())
        category = response.meta['category']
        product = list(extruct_helper.get_products_microdata_extruct(items, response, category))
        if len(product) != 1:
            raise Exception("Could not extract product in %s" % response.url)
        product_dict = product[0]
        product = product_dict['product']
        product['ProductManufacturer'] = self.extract(response.xpath("//meta[contains(@property, 'product:brand')]/@content"))
        yield product

        for product_id in product_dict['product_ids']:
            yield product_id

        bv_params = self.bv_base_params.copy()
        bv_params['bv_id'] = product['source_internal_id']
        bv_params['offset'] = 0
        review_url = self.get_review_url(**bv_params)
        request = Request(url=review_url, callback=self.parse_reviews)
        last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
            self.mysql_manager, self.spider_conf['source_id'],
            product["source_internal_id"]
        )
        request.meta['last_user_review'] = last_user_review
        request.meta['bv_id'] = product['source_internal_id']
        request.meta['product'] = product
        request.meta['filter_other_sources'] = False

        yield request
Example #3
    def parse_category(self, response):
        next_page_url_xpath = "//*[@rel='next']/@href"
        products_xpath = "//div[contains(@class, 'item') and .//*[@class='stelline']]"
        product_id_xpath = ".//*[@name='sku']/@value"
        bv_id_xpath = ".//*[@class='stelline']/@id"

        products = response.xpath(products_xpath)

        if not products:
            sub_cat_xpath = "//div[@class='box_menu']//li/a/@href"
            sub_cat_urls = self.extract_list(response.xpath(sub_cat_xpath))
            for url in sub_cat_urls:
                yield response.follow(url, callback=self.parse_category)

            return

        category = response.meta.get('category', {})
        if not category:
            category_path_xpath = "//span[@class='path']/*[@itemprop='name']//text()"
            category = CategoryItem()
            category['category_url'] = response.url
            category['category_path'] = self.extract_all(
                response.xpath(category_path_xpath), separator=' | ')
            yield category

        if self.should_skip_category(category):
            return

        for product in products:
            source_internal_id = self.extract(product.xpath(product_id_xpath))
            bv_id = product.xpath(bv_id_xpath).re_first(r'-([0-9]+)')
            if not (source_internal_id and bv_id):
                continue

            bv_params = self.bv_base_params.copy()
            bv_params['bv_id'] = bv_id
            bv_params['offset'] = 0
            review_url = self.get_review_url(**bv_params)

            request = Request(url=review_url, callback=self.parse_reviews)
            last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
                self.mysql_manager, self.spider_conf['source_id'],
                source_internal_id)
            request.meta['last_user_review'] = last_user_review
            request.meta['bv_id'] = bv_id
            request.meta['product_id'] = source_internal_id
            request.meta['OriginalCategoryName'] = category.get(
                'category_path')

            request.meta['filter_other_sources'] = False

            yield request

        next_page_url = self.extract_xpath(response, next_page_url_xpath)
        if next_page_url:
            next_page_url = get_full_url(response, next_page_url)
            request = Request(next_page_url, callback=self.parse_category)
            request.meta['category'] = category
            yield request
Example #4
    def parse_review(self, response):
        next_page_xpath = "(//*[@rel='next']/@href)[1]"
        default_rating_xpath = './/reevoo-score/@data-score'

        product = response.meta['product']
        reviews = response.xpath('//article[contains(@id,"review_")]')

        if not reviews:
            return

        # From observation, at least currys.co.uk presents review ratings in a different format
        rating_xpath = response.meta.get('rating_xpath', '')
        if not rating_xpath:
            rating_xpath = default_rating_xpath

        last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
            self.mysql_manager, self.spider_conf['source_id'],
            product["source_internal_id"])

        for review in reviews:
            user_review = ReviewItem()
            date = self.extract(
                review.xpath(
                    './/span[contains(@class, "date_publish")]/text()'))
            if date:
                user_review['TestDateText'] = date_format(date, '')
                current_user_review = dateparser.parse(
                    user_review['TestDateText'], date_formats=['%Y-%m-%d'])
                if current_user_review < last_user_review:
                    return

            user_review['DBaseCategoryName'] = "USER"
            user_review['ProductName'] = product['ProductName']
            user_review['TestUrl'] = product['TestUrl']
            user_review['SourceTestRating'] = self.extract(
                review.xpath(rating_xpath))
            user_review['Author'] = self.extract(
                review.xpath('.//h4[@class="attribution-name"]/text()'))
            user_review['TestPros'] = self.extract_all(
                review.xpath('.//dd[@class="pros"]/text()'))
            user_review['TestCons'] = self.extract_all(
                review.xpath('.//dd[@class="cons"]/text()'))
            user_review['source_internal_id'] = product['source_internal_id']

            # All reviews after the first empty review are empty, so stop here
            if user_review['TestPros'] or user_review['TestCons']:
                yield user_review
            else:
                return

        next_page_url = self.extract(response.xpath(next_page_xpath))
        if next_page_url:
            next_page_url = get_full_url(response, next_page_url)
            request = Request(next_page_url,
                              callback=self.parse_review,
                              meta=response.meta)
            yield request
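
The `rating_xpath` override in this callback is read from `response.meta`, so a site-specific entry point can swap in its own selector without touching the parsing logic. A hedged sketch of how a caller might set that override (the XPath below is purely illustrative and not taken from any spider here):

    request = Request(url=review_url, callback=self.parse_review)
    request.meta['product'] = product
    # illustrative selector only; the real currys.co.uk markup is not shown in these examples
    request.meta['rating_xpath'] = './/span[@itemprop="ratingValue"]/text()'
    yield request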
Example #5
    def parse(self, response):
        product_data = self.extract(response.xpath("//script[@type='text/javascript']"
                                                   "[contains(text(),'hof.data')]/text()"))
        product_id_re = r'"FriendlyProductId":"([0-9]+)"'

        current_page = response.meta.get('page_number')
        category = response.meta.get('category', '')
        original_url = response.meta.get('original_url', '')

        if not category:
            # We should be able to spot the category name from the URL.
            # Otherwise we would need to parse the category name from JavaScript,
            # as the site loads its product pages using knockout.js
            category = CategoryItem()
            category['category_path'] = response.url
            category['category_url'] = response.url
            yield category

            if self.should_skip_category(category):
                return

            current_page = 1
            original_url = response.url

        product_ids = re.findall(product_id_re, product_data)
        if not product_ids:
            return

        for product_id in product_ids:

            bv_params = self.bv_base_params.copy()
            bv_params['bv_id'] = product_id
            bv_params['offset'] = 0
            review_url = self.get_review_url(**bv_params)

            request = Request(url=review_url, callback=self.parse_reviews)
            last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
                self.mysql_manager, self.spider_conf['source_id'],
                product_id
            )
            request.meta['last_user_review'] = last_user_review
            request.meta['bv_id'] = product_id
            request.meta['product_id'] = product_id
            request.meta['OriginalCategoryName'] = category.get('category_path')

            request.meta['filter_other_sources'] = False

            yield request

        next_page = current_page + 1
        next_page_url = original_url + '?page={}'.format(next_page)
        next_page_request = Request(url=next_page_url, callback=self.parse)
        next_page_request.meta['page_number'] = next_page
        next_page_request.meta['category'] = category
        next_page_request.meta['original_url'] = original_url
        yield next_page_request
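
The product IDs in this spider come from a regular expression over the inline `hof.data` JavaScript rather than from the DOM. A standalone illustration of that extraction, with a made-up script fragment standing in for the real page data:

    import re

    product_data = 'hof.data = {"Products": [{"FriendlyProductId":"123456"}, {"FriendlyProductId":"789012"}]};'
    product_ids = re.findall(r'"FriendlyProductId":"([0-9]+)"', product_data)
    # ['123456', '789012'] -- each id seeds one Bazaarvoice review request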
Example #6
    def parse_product(self, response):
        product_xpaths = {
            "PicURL": "//meta[@property='og:image']/@content",
            "ProductName": "//h1[@class='productHeading']//text()",
            "ProductManufacturer": "//h1[@class='productHeading']/text()"
        }

        product = self.init_item_by_xpaths(response, "product", product_xpaths)

        match = re.search(self.source_internal_id_re, response.url)
        if match:
            product['source_internal_id'] = match.group(1)

        product['TestUrl'] = response.url
        product["OriginalCategoryName"] = response.meta["category"][
            "category_path"]
        yield product

        mpn_value = self.extract(
            response.xpath("//span[@id='productMPN']/text()"))
        if mpn_value:
            mpn = ProductIdItem()
            mpn['source_internal_id'] = product["source_internal_id"]
            mpn['ProductName'] = product["ProductName"]
            mpn['ID_kind'] = "MPN"
            mpn['ID_value'] = mpn_value
            yield mpn

        ean_value = self.extract(
            response.xpath("//span[@id='productEAN']/text()"))
        if ean_value:
            ean = ProductIdItem()
            ean['source_internal_id'] = product["source_internal_id"]
            ean['ProductName'] = product["ProductName"]
            ean['ID_kind'] = "EAN"
            ean['ID_value'] = ean_value
            yield ean

        bv_params = self.bv_base_params.copy()
        bv_params['bv_id'] = product['source_internal_id']
        bv_params['offset'] = 0
        review_url = self.get_review_url(**bv_params)
        request = Request(url=review_url, callback=self.parse_reviews)

        last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
            self.mysql_manager, self.spider_conf['source_id'],
            product["source_internal_id"])
        request.meta['last_user_review'] = last_user_review

        request.meta['bv_id'] = product['source_internal_id']
        request.meta['product'] = product
        request.meta['filter_other_sources'] = False

        yield request
Example #7
    def parse_reviews(self, response):
        product = response.meta['product']
        rating_xpath = ".//*[@class='review--header-rating']/text()"
        title_xpath = ".//h3[contains(@class, 'review--header-title')]/text()"
        summary_xpath = ".//div[contains(@class, 'review--description')]//text()"
        header_xpath = ".//div[@class='review--header-review-info']//text()"
        date_xpath = ".//div[@class='review--header-review-info']/time/@datetime"

        pros_xpath = ".//li[contains(@class, 'pros-and-cons-pro')]//*[@class!='is-visually-hidden']/text()"
        cons_xpath = ".//li[contains(@class, 'pros-and-cons-con')]//*[@class!='is-visually-hidden']/text()"

        next_page_xpath = "//a[@rel='next']/@href"
        reviews = response.xpath("//li[contains(@class, 'reviews__list-item')]")

        last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
            self.mysql_manager, self.spider_conf['source_id'],
            product["source_internal_id"]
        )

        for review in reviews:
            date = self.extract_xpath(review, date_xpath)
            if date:
                date = date_format(date, '')
                current_user_review = dateparser.parse(date,
                                                       date_formats=['%Y-%m-%d'])
                if current_user_review < last_user_review:
                    return

            title = self.extract_xpath(review, title_xpath)
            rating = self.extract_xpath(review, rating_xpath)
            # ratings may come as "N/10" (note scale=10 below); keep the part before the slash
            rating = rating.split('/')[0]

            summary = self.extract_all_xpath(review, summary_xpath)
            pros = self.extract_all_xpath(review, pros_xpath, separator=' ; ')
            cons = self.extract_all_xpath(review, cons_xpath, separator=' ; ')
            author = ''
            header = self.extract_all_xpath(review, header_xpath)
            if header:
                author = header.split('|')
                author = strip(author[0])

            user_review = ReviewItem.from_product(product=product, tp='USER', rating=rating,
                                                  title=title, date=date, summary=summary,
                                                  pros=pros, cons=cons, author=author, scale=10)
            yield user_review

        next_page_url = self.extract_xpath(response, next_page_xpath)
        if next_page_url:
            next_page_url = get_full_url(response, next_page_url)
            request = Request(next_page_url, callback=self.parse_reviews)
            request.meta['product'] = product
            yield request
Example #8
    def parse_product(self, response):
        source_internal_id_re = r'(\d+)'

        _product = extruct_helper.product_items_from_microdata(
            response, response.meta['category'])
        if not _product:
            request = self._retry(response.request)
            yield request
            return

        product = _product.get('product')
        product_ids = _product.get('product_ids', [])

        if not (product and product_ids):
            self.logger.info("Could not scrape product at %s" % response.url)
            return

        # unfortunately, we need to clean up the source_internal_id for this source
        match = re.search(source_internal_id_re, product['source_internal_id'])
        if match:
            product['source_internal_id'] = match.group(1)

        for product_id in product_ids:
            product_id['source_internal_id'] = product['source_internal_id']
            if product_id['ID_kind'] == 'sku':
                match = re.search(source_internal_id_re,
                                  product_id['ID_value'])
                if match:
                    product_id['ID_value'] = match.group(1)
            yield product_id

        bv_id = product['source_internal_id']

        if product['ProductName'] and product['source_internal_id'] and bv_id:
            yield product

            bv_params = self.bv_base_params.copy()
            bv_params['bv_id'] = bv_id
            bv_params['offset'] = 0
            review_url = self.get_review_url(**bv_params)
            request = Request(url=review_url, callback=self.parse_reviews)

            last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
                self.mysql_manager, self.spider_conf['source_id'],
                product["source_internal_id"])
            request.meta['last_user_review'] = last_user_review
            request.meta['filter_other_sources'] = False

            request.meta['bv_id'] = bv_id
            request.meta['product'] = product
            yield request
        else:
            self.logger.info("Could not scrape product at %s" % response.url)
Example #9
    def parse_user_reviews(self, response):
        # Featured reviews are always at the top. We cannot do incremental scraping before finishing
        # parsing all featured reviews, as there may be newer regular reviews following them.
        product = response.meta['product']
        reviews_xpath = "//ul[@class='userlist']/li[@id]"
        user_review_content_xpaths = {
            "TestTitle": ".//p[@class='reviewTitle']/text()",
            "Author": ".//*[@class='reviewedBy']/a/text()",
            "SourceTestRating": ".//*[@class='ratingStarSmall']/text()"
        }

        review_summary_xpath = ".//div[@class='reviewText']/span[@class='smallContent']/text()"
        review_summary_xpath_part2 = ".//div[@class='reviewText']//span[@class='moreReview']/text()"

        next_page_xpath = "//li[@id='pagNext']/a/@href"

        last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
            self.mysql_manager, self.spider_conf['source_id'],
            product["source_internal_id"])

        review_selectors = response.xpath(reviews_xpath)
        for review_selector in review_selectors:
            is_featured_review = 'featured' in self.extract(
                review_selector.xpath('./@class')).lower()
            review = self.parse_review(response, product,
                                       user_review_content_xpaths, 'USER',
                                       review_selector)

            # incremental scraping
            if review.get('TestDateText', '') and not is_featured_review:
                current_user_review = dateparser.parse(
                    review['TestDateText'], date_formats=['%Y-%m-%d'])
                if current_user_review < last_user_review:
                    return

            # If we fail to get the summary, an exception will be thrown
            review['TestSummary'] = review_selector.xpath(
                review_summary_xpath).extract_first()
            summary_part2 = review_selector.xpath(
                review_summary_xpath_part2).extract_first()
            if summary_part2:
                review['TestSummary'] += summary_part2
            review['TestSummary'] = review['TestSummary'].strip()

            yield review

        next_page_url = self.extract(response.xpath(next_page_xpath))
        if next_page_url:
            next_page_url = get_full_url(response, next_page_url)
            next_page_request = Request(next_page_url,
                                        callback=self.parse_user_reviews,
                                        meta=response.meta)
            yield next_page_request
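
Because featured reviews are pinned to the top regardless of their date, the incremental check above is skipped for them; otherwise an old featured review could end the crawl before newer regular reviews further down the page were seen. A compact restatement of that rule as a helper (a sketch, not code from the spider):

    import dateparser

    def should_stop_paginating(review, is_featured_review, last_user_review):
        # Featured reviews never trigger early termination, even when they are old.
        if is_featured_review or not review.get('TestDateText'):
            return False
        current = dateparser.parse(review['TestDateText'], date_formats=['%Y-%m-%d'])
        return bool(current and last_user_review and current < last_user_review)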
Example #10
    def parse_product(self, response):
        category = response.meta['category']
        soup = BeautifulSoup(response.body, "lxml")
        item_id = response.url.split('/')[-1].strip()
        product = ProductItem()
        product['source_internal_id'] = item_id
        product['ProductName'] = soup.find('h1', {
            'itemprop': 'name'
        }).text.strip()
        product['ProductManufacturer'] = soup.find('a', {
            'id': 'WMItemBrandLnk'
        }).text.strip() if soup.find('a', {'id': 'WMItemBrandLnk'}) else ''

        product['OriginalCategoryName'] = category['category_path']
        product['PicURL'] = soup.find(
            'img', {'class': 'product-image'})['src'].strip()
        product['TestUrl'] = response.url
        yield product

        price = soup.find('div', {'itemprop': 'price'})
        product_id = ProductIdItem()
        product_id['source_id'] = product['source_id']
        product_id['ProductName'] = product['ProductName']
        product_id['source_internal_id'] = product['source_internal_id']
        if price:
            try:
                product_id['ID_kind'] = 'price'
                # e.g. "$12.50" -> "12,50"
                product_id['ID_value'] = format(
                    round(float(price.text.replace('$', '')), 2),
                    ".2f").replace('.', ',')
            except (ValueError, TypeError):
                pass
        yield product_id

        latest_review_date = get_latest_user_review_date_by_sii(
            self.mysql_manager, self.spider_conf['source_id'], item_id)

        review_page = 1
        reviews_link = reviews_link_pattern % (item_id, str(review_page))
        request = Request(reviews_link, callback=self.parse_review)
        request.meta['ProductName'] = product['ProductName']
        request.meta['item_id'] = item_id
        request.meta['review_page'] = review_page
        request.meta['latest_review_date'] = latest_review_date
        anchors = soup.find_all('a', {'class': 'js-product-anchor'})
        for anchor in anchors:
            if 'reviews' in anchor.text:
                request.meta['max_idx'] = int(
                    anchor.text.replace('reviews', '').strip())
                break
        yield request
Example #11
    def parse_product(self, response):
        product_xpaths = {"PicURL": "(//*[@property='og:image'])[1]/@content",
                          "ProductName": "//h1//text()",
                          "OriginalCategoryName": "//li[contains(@class, 'item category')][last()]/a/text()",
                          "ProductManufacturer":  "//th[@class='col label' and text()='Brand']/"
                                                  "following-sibling::*/text()"
                          }

        product = self.init_item_by_xpaths(response, "product", product_xpaths)
        bv_config_data = self.extract(response.xpath("//script[@type='text/javascript']"
                                                     "[contains(text(),'productId')]/text()"))

        if product.get('OriginalCategoryName', ''):
            category = CategoryItem()
            category_url = self.extract(response.xpath("//li[contains(@class, 'item category')][last()]/a/@href"))
            category['category_url'] = get_full_url(response, category_url)
            category['category_leaf'] = product['OriginalCategoryName']
            category['category_path'] = category['category_leaf']
            yield category

        match = re.search(self.source_internal_id_re, bv_config_data)
        if match:
            product["source_internal_id"] = match.group(1).upper()

            product_id = ProductIdItem()
            product_id['source_internal_id'] = product["source_internal_id"]
            product_id['ProductName'] = product["ProductName"]
            product_id['ID_kind'] = "richersounds_id"
            product_id['ID_value'] = product["source_internal_id"]
            yield product_id
            yield product

            bv_params = self.bv_base_params.copy()
            bv_params['bv_id'] = product['source_internal_id']
            bv_params['offset'] = 0
            review_url = self.get_review_url(**bv_params)

            request = Request(url=review_url, callback=self.parse_reviews)

            last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
                self.mysql_manager, self.spider_conf['source_id'],
                product["source_internal_id"]
            )
            request.meta['last_user_review'] = last_user_review

            request.meta['bv_id'] = product['source_internal_id']
            request.meta['product'] = product
            request.meta['filter_other_sources'] = False

            yield request
Example #12
    def parse_product(self, response):
        product_name_xpath = "//h1[@itemprop='name']/text()"
        product_id_xpath = "//div[@class='productid']/text()"
        manufacturer_xpath = "//input[@id='productManufacturerName']/@value"
        bv_id_xpath = "//input[@id='product_ID']/@value"

        product = ProductItem()
        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = response.meta['OriginalCategoryName']
        product['source_internal_id'] = self.extract(
            response.xpath(product_id_xpath))
        product['ProductName'] = self.extract(
            response.xpath(product_name_xpath))
        product['ProductManufacturer'] = self.extract(
            response.xpath(manufacturer_xpath))

        bv_id = self.extract(response.xpath(bv_id_xpath))

        if product['ProductName'] and product['source_internal_id'] and bv_id:
            yield product

            product_id = self.product_id(product)
            product_id['ID_kind'] = "boots_com_id"
            product_id['ID_value'] = product['source_internal_id']
            yield product_id

            bv_params = self.bv_base_params.copy()
            bv_params['bv_id'] = bv_id
            bv_params['offset'] = 0
            review_url = self.get_review_url(**bv_params)
            request = Request(url=review_url, callback=self.parse_reviews)

            last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
                self.mysql_manager, self.spider_conf['source_id'],
                product["source_internal_id"])
            request.meta['last_user_review'] = last_user_review
            request.meta['filter_other_sources'] = False

            request.meta['bv_id'] = bv_id
            request.meta['product'] = product
            yield request
        else:
            self.logger.info("Could not scrape product at %s" % response.url)
Example #13
    def parse_reviews(self, response):
        next_page_xpath = '//a[@class="next-arrow"]/@href'

        product = response.meta['product']
        last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
            self.mysql_manager, self.spider_conf['source_id'],
            product["source_internal_id"]
        )

        reviews = response.xpath('//div[contains(@class,"reviewWidget")]')
        for review in reviews:
            user_review = ReviewItem()
            date = self.extract(review.xpath('.//span[@class="reviewDate"]/text()'))
            if date:
                user_review['TestDateText'] = date_format(date, '')
                current_user_review = dateparser.parse(user_review['TestDateText'],
                                                       date_formats=['%Y-%m-%d'])
                if current_user_review < last_user_review:
                    return

            user_review['DBaseCategoryName'] = "USER"
            user_review['SourceTestScale'] = 5
            user_review['ProductName'] = product['ProductName']
            user_review['TestUrl'] = product['TestUrl']
            user_review['source_internal_id'] = product['source_internal_id']
            rating = self.extract(review.xpath('.//span[contains(@class,"ratingSpriteUnder")]/@class'))
            # the class attribute encodes the score (e.g. "ratingSpriteUnder ratingSprite_4-5");
            # str.strip() removes that character set from both ends, leaving "4-5" -> "4.5"
            rating = rating.strip('ratingSpriteUnder ratingSprite_').replace('-', '.')
            user_review['SourceTestRating'] = rating
            user_review['Author'] = self.extract(review.xpath('.//p[@class="name"]/text()'))
            user_review['TestTitle'] = self.extract(review.xpath('.//h2/text()'))
            user_review['TestSummary'] = self.extract_all(review.xpath('.//div[@class="reviewContainer"]/p/text()'))
            user_review['TestPros'] = self.extract_all(review.xpath('.//ul[@class="pros"]/li/text()'), '; ')
            user_review['TestCons'] = self.extract_all(review.xpath('.//ul[@class="cons"]/li/text()'), '; ')
            yield user_review

        next_page_url = self.extract_xpath(response, next_page_xpath)
        if next_page_url:
            next_page_url = get_full_url(response, next_page_url)
            request = Request(next_page_url, callback=self.parse_reviews)
            request.meta['product'] = product
            yield request
Example #14
    def parse_product(self, response):
        sii_re = r'-([^-]+)\.html'
        product = ProductItem()

        product['TestUrl'] = response.url.split('#')[0]
        product['OriginalCategoryName'] = response.meta['category'][
            'category_path']
        product['ProductName'] = self.extract(response.xpath('//h1/text()'))
        product['PicURL'] = self.extract(
            response.xpath('//a[@itemprop="image"]/@href'))
        product['ProductManufacturer'] = self.extract(
            response.xpath('//span[@itemprop="brand"]/a/span/text()'))

        match = re.search(sii_re, response.url)
        if not match:
            return
        source_internal_id = match.group(1)
        product['source_internal_id'] = source_internal_id
        yield product

        review_xpath = "//ul[@class='pagNum']/@data-action"
        total_page_xpath = "//ul[@class='pagNum']/li[@class='next']/preceding-sibling::li[1]/text()"

        review_url = self.extract_xpath(response, review_xpath)
        total_pages = self.extract_xpath(response, total_page_xpath)
        if not total_pages:
            total_pages = 1
        latest_db_date = get_latest_user_review_date_by_sii(
            self.mysql_manager, self.spider_conf["source_id"],
            source_internal_id)
        if review_url:
            review_url = set_query_parameter(review_url, 'ReviewOrdering', '2')
            review_url = get_full_url(response, review_url)
            request = Request(url=review_url, callback=self.parse_reviews)
            request.meta['product'] = product
            request.meta['current_page'] = 1
            if total_pages:
                request.meta['total_pages'] = total_pages
            request.meta['latest_db_date'] = latest_db_date
            yield request
Example #15
    def parse(self, response):
        #Product
        product_xpaths = {
            "ProductName":
            "//h1[contains(@class,'title')]/span[@itemprop='name']/text()",
        }
        product = self.init_item_by_xpaths(response, "product", product_xpaths)
        product['source_internal_id'] = self.extract(
            response.xpath("//*[@itemprop='sku']/text()"))

        #Category
        category_leaf_xpath = "(//a[contains(@itemprop,'url')]/span[contains(@itemprop,'title')])[last()]/text()"
        category_path_xpath = "(//a[contains(@itemprop,'url')]/span[contains(@itemprop,'title')])/text()"
        category = CategoryItem()
        category['category_leaf'] = self.extract(
            response.xpath(category_leaf_xpath))
        category['category_path'] = self.extract_all(
            response.xpath(category_path_xpath), ' | ')
        #product's OriginalCategoryName should always match category_path of the corresponding category item
        product['OriginalCategoryName'] = category['category_path']

        yield product
        yield category

        #Review
        bv_params = self.bv_base_params.copy()
        bv_params['bv_id'] = product['source_internal_id']
        bv_params['offset'] = 0
        review_url = self.get_review_url(**bv_params)
        request = Request(url=review_url, callback=self.parse_reviews)

        last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
            self.mysql_manager, self.spider_conf['source_id'],
            product["source_internal_id"])
        request.meta['last_user_review'] = last_user_review
        request.meta['bv_id'] = product['source_internal_id']
        request.meta['product'] = product
        request.meta['filter_other_sources'] = False
        yield request
Example #16
    def parse_product(self, response):

        product = ProductItem()
        product['TestUrl'] = response.url
        # Category is hard-coded; the breadcrumb xpath below could be used instead:
        # self.extract_all(response.xpath('//ol[@id="breadcrumb-list"]/li/a/text()'), "->")
        product['OriginalCategoryName'] = 'Cell phones'

        productname_xpath = '//h1/span[@itemprop="name"]//text()'
        picurl_xpath = '//img[@id="Image1x"]//@src'
        manufacturer_xpath = '//h1/span[@itemprop="brand"]//text()'
        ean_xpath = '//meta[@itemprop="gtin13"]/@content'

        product['ProductName'] = self.extract(
            response.xpath(productname_xpath))
        product['PicURL'] = get_full_url(
            response, self.extract(response.xpath(picurl_xpath)))
        product['ProductManufacturer'] = self.extract(
            response.xpath(manufacturer_xpath))
        product['source_internal_id'] = self.extract(response.xpath(ean_xpath))
        yield product

        bv_params = self.bv_base_params.copy()
        bv_params['bv_id'] = product['source_internal_id']
        bv_params['offset'] = 0
        review_url = self.get_review_url(**bv_params)
        request = Request(url=review_url, callback=self.parse_reviews)

        last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
            self.mysql_manager, self.spider_conf['source_id'],
            product["source_internal_id"])
        request.meta['last_user_review'] = last_user_review

        request.meta['bv_id'] = product['source_internal_id']
        request.meta['product'] = product
        request.meta['extra_parser'] = self.final_review_parser

        request.meta['filter_other_sources'] = False

        yield request
Example #17
    def call_review(self, response, product=None, incremental=True):
        bv_id = response.meta.get('bv_id', None)

        if not bv_id:
            bv_id_xpath = "//div/@data-product-id"
            bv_id = self.extract_xpath(response, bv_id_xpath)

        if incremental:
            last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
                self.mysql_manager, self.spider_conf['source_id'],
                product["source_internal_id"])
            response.meta['last_user_review'] = last_user_review

        bv_params = self.bv_base_params.copy()
        bv_params['bv_id'] = bv_id
        bv_params['offset'] = 0
        full_url = self.FULL_URL_PATTERN.format(**bv_params)

        response.meta['product'] = product
        response.meta['bv_id'] = bv_id
        request = Request(full_url,
                          callback=self.parse_reviews,
                          meta=response.meta)
        yield request
Example #18
    def parse_category(self, response):
        products_xpath = "//div[@class='Productlist']/div"
        product_sku_xpath = "./@data-sku"
        has_review_xpath = ".//span[@class='Rating-average']"
        next_page_xpath = "(//*[@rel='next'])[1]/@href"

        category = response.meta.get('category', '')
        if not category:
            # the category we get here is actually the parent category
            category_json_ld = extruct_helper.extract_json_ld(
                response.text, 'BreadcrumbList')
            if not category_json_ld:
                request = self._retry(response.request)
                yield request
                return

            category = extruct_helper.category_item_from_breadcrumbs_json_ld(
                category_json_ld)
            current_category_name = self.extract(
                response.xpath(
                    "//div[@id='breadcrumb']/ul/li[@class='pad-left']/text()"))
            if current_category_name.lower() != category['category_leaf'].lower():
                category['category_leaf'] = current_category_name
                category['category_path'] = u'{} | {}'.format(
                    category['category_path'], current_category_name)
                category['category_url'] = response.url

            yield category

            if self.should_skip_category(category):
                return

        products = response.xpath(products_xpath)

        # Not a leaf category page
        if not products:
            return

        # We skip the product pages, as feelunique.com tries to block us if we access too many
        # of their pages, but they cannot block access to the Bazaarvoice API
        for product in products:
            has_review = product.xpath(has_review_xpath)
            if not has_review:
                continue

            product_sku = self.extract(product.xpath(product_sku_xpath))
            if product_sku:
                product_id = ProductIdItem()
                product_id['source_internal_id'] = product_sku
                product_id['ID_kind'] = 'feelunique_internal_id'
                product_id['ID_value'] = product_sku
                yield product_id

                bv_params = self.bv_base_params.copy()
                bv_params['bv_id'] = product_sku
                bv_params['offset'] = 0
                review_url = self.get_review_url(**bv_params)

                last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
                    self.mysql_manager, self.spider_conf['source_id'],
                    product_sku)

                request = Request(review_url, callback=self.parse_reviews)
                request.meta['last_user_review'] = last_user_review
                request.meta['filter_other_sources'] = False
                request.meta['OriginalCategoryName'] = category[
                    'category_path']
                request.meta['bv_id'] = product_sku
                yield request
            else:
                product_url_xpath = "./a/@href"
                # the xpath is relative to the current product node
                product_url = self.extract(product.xpath(product_url_xpath))
                product_url = get_full_url(response, product_url)
                self.logger.info("Failed to get SKU for product at %s" %
                                 product_url)

        next_page_url = self.extract_xpath(response, next_page_xpath)
        if next_page_url:
            next_page_url = get_full_url(response, next_page_url)
            next_page_request = Request(next_page_url,
                                        callback=self.parse_category)
            next_page_request.meta['category'] = category
            yield next_page_request
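
`extruct_helper` here appears to be a project-specific wrapper; with the `extruct` library itself, pulling the BreadcrumbList JSON-LD and turning it into a category path might look roughly like the sketch below (assuming the page embeds a standard schema.org breadcrumb and that each ListItem carries a `name` directly, which schema.org also allows to live under `item`):

    import extruct

    data = extruct.extract(response.text, syntaxes=['json-ld'])
    breadcrumbs = [d for d in data.get('json-ld', []) if d.get('@type') == 'BreadcrumbList']
    if breadcrumbs:
        names = [li.get('name', '') for li in breadcrumbs[0].get('itemListElement', [])]
        category_path = ' | '.join(n for n in names if n)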
Example #19
    def parse_product(self, response):
        # sku
        # --------------------------------------------------------------------
        sku_xpath = "//meta[@name='Product-Article-Number']/@content"
        sku = self.extract_xpath(response, sku_xpath)
        if not sku:
            # not a product page
            return

        # set up product item
        # --------------------------------------------------------------------
        product_xpaths = {
            "ProductName":
            "//meta[@name='productName']/@content",
            "OriginalCategoryName":
            "//meta[@name='Product-Sub-Category']/@content",
        }
        product = self.init_item_by_xpaths(response, "product", product_xpaths)
        product['source_internal_id'] = sku
        # all products are from canon.nl
        product['ProductManufacturer'] = 'Canon'

        # product picture
        # --------------------------------------------------------------------
        pic_url_xpath = "//meta[@name='Product-Image-Large']/@content"
        pic_url_xpath_alt = "//meta[@name='Product-Image-Small']/@content"
        # the XPath only yields a relative URL
        pic_url = self.extract(response.xpath(pic_url_xpath))
        if not pic_url:
            # fall back to the small product image
            pic_url = self.extract(response.xpath(pic_url_xpath_alt))
        product['PicURL'] = get_full_url(response, pic_url)

        # double check product name
        # --------------------------------------------------------------------
        product_name = product.get('ProductName')
        if not product_name:
            product_name_xpath = "//meta[@name='og:title']/@content"
            product_name = self.extract(response.xpath(product_name_xpath))
            if product_name:
                product['ProductName'] = product_name

        # double check OriginalCategoryName
        # --------------------------------------------------------------------
        original_category = product.get('OriginalCategoryName')
        if not original_category:
            '''
            A typical canon.nl product page has the category names in its URL,
            for example:
                https://www.canon.nl/for_home/product_finder/printers/laser/i-sensys_lbp7750cdn/
            The last non-empty segment is the product slug, and the two
            segments before it ('printers' and 'laser' here) are useful
            for our category matching.
            '''
            sep = '|'
            start_category_ind = -4
            end_category_ind = -2
            all_category_names = sep.join(
                response.url.split('/')[start_category_ind:end_category_ind])
            product['OriginalCategoryName'] = all_category_names

        # set up category item
        # --------------------------------------------------------------------
        if product.get('OriginalCategoryName', ''):
            category = CategoryItem()
            category['category_path'] = product['OriginalCategoryName']
            yield category

        # set up product_id item
        # --------------------------------------------------------------------
        product_id = ProductIdItem()
        product_id['source_internal_id'] = product['source_internal_id']
        product_id['ProductName'] = product['ProductName']
        product_id['ID_kind'] = 'canon_id'
        product_id['ID_value'] = product['source_internal_id']
        yield product_id
        yield product

        # set up for bv review
        # --------------------------------------------------------------------
        bv_params = self.bv_base_params.copy()
        bv_params['bv_id'] = product['source_internal_id']
        bv_params['offset'] = 0
        review_url = self.get_review_url(**bv_params)

        request = Request(url=review_url, callback=self.parse_reviews)

        last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
            self.mysql_manager, self.spider_conf['source_id'],
            product["source_internal_id"])
        request.meta['last_user_review'] = last_user_review
        request.meta['bv_id'] = product['source_internal_id']
        request.meta['product'] = product
        request.meta['filter_other_sources'] = False

        yield request
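
The `[-4:-2]` slice in the category fallback above keeps the two path segments just before the product slug. Worked on the URL from the docstring (the trailing slash makes the last split element an empty string, which is why the slice ends at -2):

    url = 'https://www.canon.nl/for_home/product_finder/printers/laser/i-sensys_lbp7750cdn/'
    parts = url.split('/')
    print('|'.join(parts[-4:-2]))  # printers|laser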
Example #20
    def parse_product(self, response):
        mobile_xpath = "//*[@id='mobile_content_bar']"
        mobile = response.xpath(mobile_xpath)
        canonical_url_xpath = "//link[@rel='canonical']/@href"
        if mobile:
            canonical_url = self.extract(response.xpath(canonical_url_xpath))
            request = Request(url=canonical_url, callback=self.parse_product)
            request.meta['category'] = response.meta['category']
            request.meta['review_url'] = response.meta['review_url']
            yield request
            return

        pic_url_xpath = '//img[contains(@class,"s7carousel-main-image-slide-vertical")][1]/@src'
        product_name_xpath = "//div[@class='product-name']//span[@itemprop='name']/text()"
        product_name_alt_xpath = "//div[@id='pdpProduct']/h1/text()"
        product_id_xpath = "//*[@itemprop='sku']/text()"
        #product_id_alt_xpath = "//span[contains(@class,'partnumber')]/text()"
        manufacturer_xpath = "//*[@itemprop='brand']/text()"

        product = ProductItem()
        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = response.meta['OriginalCategoryName']
        product['source_internal_id'] = self.extract(
            response.xpath(product_id_xpath))
        product['ProductName'] = self.extract(
            response.xpath(product_name_xpath))
        product['ProductManufacturer'] = self.extract(
            response.xpath(manufacturer_xpath))
        product['PicURL'] = self.extract(response.xpath(pic_url_xpath))

        if not product['ProductName']:
            product['ProductName'] = self.extract(
                response.xpath(product_name_alt_xpath))

        # the id from alt_xpath is different from the default xpath, do not use it
        # if not product['source_internal_id']:
        #    product['source_internal_id'] = self.extract(response.xpath(product_id_alt_xpath))

        bv_id = ''
        bv_id_re = r'product/([0-9]+)$'
        bv_id_match = re.search(bv_id_re, response.url, re.I)
        if bv_id_match:
            bv_id = bv_id_match.group(1)

        if product['ProductName'] and product['source_internal_id'] and bv_id:
            yield product

            product_id = self.product_id(product)
            product_id['ID_kind'] = "argos_uk_id"
            product_id['ID_value'] = product['source_internal_id']
            yield product_id

            review_url = self.get_review_url(bv_id=bv_id, offset=0)
            request = Request(url=review_url, callback=self.parse_reviews)

            last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
                self.mysql_manager, self.spider_conf['source_id'],
                product["source_internal_id"])
            request.meta['last_user_review'] = last_user_review
            request.meta['filter_other_sources'] = False

            request.meta['bv_id'] = bv_id
            request.meta['product'] = product
            yield request
        else:
            self.logger.info("Could not scrape product at %s" % response.url)
Example #21
    def parse_reviews(self, response):
        jstree = js2xml.parse(response.body)
        html_xpath = "//var[@name='materials']/object/property[@name='BVRRSourceID']/string/text()"
        html = jstree.xpath(html_xpath)
        if not html:
            return
        selector = Selector(text=html[0])

        next_page_xpath = '(//*[contains(@class,"BVRRNextPage")])[1]/a/@data-bvjsref'
        review_list_xpath = '//*[contains(@class,"BVRRContentReview")]'
        from_product_url_xpath = ".//div[contains(@class, 'BVDI_SUAttribution')]//a[@class='BVDILink']/@href"
        from_another_source_xpath = ".//*[contains(@class,'BVRRSyndicatedContentAttribution')]"

        filter_other_sources = response.meta.get('filter_other_sources', None)
        extra_review_parser = response.meta.get('extra_review_parser', None)
        last_user_review = response.meta.get('last_user_review', None)

        product = response.meta['product']
        if not product["source_internal_id"]:
            raise Exception("BV Product without source_internal_id")
        if not last_user_review:
            last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
                self.mysql_manager, self.source_id,
                product["source_internal_id"])

        review_list = selector.xpath(review_list_xpath)
        if not review_list:
            return

        for review_selector in review_list:
            skip_review = False
            if filter_other_sources:
                skip_review = review_selector.xpath(from_another_source_xpath)

            from_product_url = self.extract_xpath(review_selector,
                                                  from_product_url_xpath)
            from_product = True
            if from_product_url:
                from_product = (product["source_internal_id"].lower()
                                in from_product_url.lower())

            review = self._parse_review(product, review_selector,
                                        extra_review_parser)

            if last_user_review:
                current_user_review = datetime.strptime(
                    review['TestDateText'], '%Y-%m-%d')
                if last_user_review > current_user_review:
                    return

            if from_product and not skip_review:
                yield review

        next_page_url = self.extract_xpath(selector, next_page_xpath)
        if next_page_url:
            headers = response.request.headers
            request = Request(next_page_url,
                              callback=self.parse_reviews,
                              headers=headers)
            request.meta['product'] = product
            request.meta['last_user_review'] = last_user_review
            request.meta['filter_other_sources'] = filter_other_sources
            request.meta['extra_review_parser'] = extra_review_parser
            yield request
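
`js2xml` converts the Bazaarvoice JavaScript payload into an XML tree, which is why an XPath over `var`/`property` nodes can pull out the review HTML embedded in `materials.BVRRSourceID`. A minimal standalone sketch of that step, with a toy payload in place of the real response body:

    import js2xml
    from scrapy.selector import Selector

    body = 'var materials = {"BVRRSourceID": "<div class=\\"BVRRContentReview\\">great</div>"};'
    jstree = js2xml.parse(body)
    html = jstree.xpath("//var[@name='materials']/object/property"
                        "[@name='BVRRSourceID']/string/text()")
    if html:
        selector = Selector(text=html[0])  # now regular HTML XPaths work on the reviews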
Example #22
    def parse_product(self, response):
        category = response.meta['category']
        items = extruct_helper.get_microdata_extruct_items(
            response.body_as_unicode())
        ean_xpath = '//a[@data-ean]/@data-ean'
        brand_alt_xpath = "//meta[@property='product:brand']/@content"
        product = list(
            extruct_helper.get_products_microdata_extruct(
                items, response, category))
        if len(product) != 1:
            request = self._retry(response.request)
            yield request
            return

        product_dict = product[0]
        product = product_dict['product']

        if not product['ProductManufacturer']:
            product['ProductManufacturer'] = self.extract_xpath(
                response, brand_alt_xpath)

        yield product
        for product_id in product_dict['product_ids']:
            yield product_id

        ean_value = self.extract_xpath(response, ean_xpath)
        if ean_value:
            ean = self.product_id(product, kind='EAN', value=int(ean_value))
            yield ean

        first_page_review_xpath = "//ul[contains(@class, 'js-product-reviews-first')]/@data-href"
        next_page_review_xpath = "//ul[contains(@class, 'js-product-reviews-remaining')]/@data-href"
        reviews_per_page_xpath = "//ul[contains(@class, 'js-product-reviews-remaining')]/@data-paged-per-page"
        total_reviews_xpath = "//ul[contains(@class, 'js-product-reviews-remaining')]/@data-paged-all"
        initial_index_xpath = "//ul[contains(@class, 'js-product-reviews-remaining')]/@data-paged-current-index"
        paging_parameter_xpath = "//ul[contains(@class, 'js-product-reviews-remaining')]/@data-paged-url-param"

        first_page_review_url = self.extract_xpath(response,
                                                   first_page_review_xpath)
        if first_page_review_url:
            first_page_review_url = get_full_url(response,
                                                 first_page_review_url)
            first_page_review_url = set_query_parameter(
                first_page_review_url, 'sorting', 'LATEST')

            next_page_review_url = self.extract_xpath(response,
                                                      next_page_review_xpath)

            paging_meta = {}
            if next_page_review_url:
                last_review_db = get_latest_user_review_date_by_sii(
                    self.mysql_manager, self.spider_conf['source_id'],
                    product['source_internal_id'])
                next_page_review_url = get_full_url(response,
                                                    next_page_review_url)
                next_page_review_url = set_query_parameter(
                    next_page_review_url, 'sorting', 'LATEST')

                reviews_per_page = self.extract_xpath(response,
                                                      reviews_per_page_xpath)
                total_reviews = self.extract_xpath(response,
                                                   total_reviews_xpath)
                current_index = self.extract_xpath(response,
                                                   initial_index_xpath)
                paging_parameter = self.extract_xpath(response,
                                                      paging_parameter_xpath)
                paging_meta = {
                    'next_page_review_url': next_page_review_url,
                    'reviews_per_page': int(reviews_per_page),
                    'total_reviews': int(total_reviews),
                    'current_index': int(current_index),
                    'paging_parameter': paging_parameter,
                    'last_review_db': last_review_db
                }

            meta = {'product': product}
            headers = {
                'Referer': response.url,
                'X-Requested-With': 'XMLHttpRequest'
            }
            meta.update(paging_meta)

            request = Request(first_page_review_url,
                              meta=meta,
                              headers=headers,
                              callback=self.parse_reviews)
            yield request