def parse_product(self, response):
    """Yield the microdata product and its ids, then schedule the review page."""
    review_url_xpath = "//div[@class='reviews']/p[@class='links ftr']/a/@href"
    extracted = extruct_helper.product_items_from_microdata(
        response, response.meta['category'])
    if not extracted:
        # Microdata extraction failed -- requeue the page for another attempt.
        yield self._retry(response.request)
        return
    product = extracted.get('product')
    yield product
    for product_id in extracted.get('product_ids'):
        yield product_id
    review_url = self.extract(response.xpath(review_url_xpath))
    if not review_url:
        return
    # Only fetch reviews newer than the latest one already stored in the DB.
    last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
        self.mysql_manager, self.spider_conf['source_id'],
        product["source_internal_id"])
    review_request = Request(url=get_full_url(response, review_url),
                             callback=self.parse_review)
    review_request.meta['product'] = product
    review_request.meta['last_user_review'] = last_user_review
    yield review_request
def parse_product(self, response):
    """Extract exactly one microdata product, then queue a Bazaarvoice review request."""
    items = extruct_helper.get_microdata_extruct_items(
        response.body_as_unicode())
    category = response.meta['category']
    extracted = list(extruct_helper.get_products_microdata_extruct(
        items, response, category))
    if len(extracted) != 1:
        raise Exception("Could not extract product in %s" % response.url)
    product_dict = extracted[0]
    product = product_dict['product']
    # Brand is carried in a meta tag rather than in the microdata block.
    product['ProductManufacturer'] = self.extract(response.xpath(
        "//meta[contains(@property, 'product:brand')]/@content"))
    yield product
    for product_id in product_dict['product_ids']:
        yield product_id
    sii = product['source_internal_id']
    bv_params = self.bv_base_params.copy()
    bv_params['bv_id'] = sii
    bv_params['offset'] = 0
    request = Request(url=self.get_review_url(**bv_params),
                      callback=self.parse_reviews)
    # Incremental scraping: only reviews newer than the stored latest are wanted.
    request.meta['last_user_review'] = \
        incremental_utils.get_latest_user_review_date_by_sii(
            self.mysql_manager, self.spider_conf['source_id'], sii)
    request.meta['bv_id'] = sii
    request.meta['product'] = product
    request.meta['filter_other_sources'] = False
    yield request
def parse_category(self, response):
    """Walk a category page: descend into sub-categories, emit the category item,
    and queue one Bazaarvoice review request per listed product."""
    products_xpath = "//div[contains(@class, 'item') and .//*[@class='stelline']]"
    products = response.xpath(products_xpath)
    if not products:
        # Not a leaf category page: follow the sub-category links instead.
        sub_cat_xpath = "//div[@class='box_menu']//li/a/@href"
        for url in self.extract_list(response.xpath(sub_cat_xpath)):
            yield response.follow(url, callback=self.parse_category)
        return
    category = response.meta.get('category', {})
    if not category:
        # First page of this category: build and emit the category item.
        category = CategoryItem()
        category['category_url'] = response.url
        category['category_path'] = self.extract_all(
            response.xpath("//span[@class='path']/*[@itemprop='name']//text()"),
            separator=' | ')
        yield category
        if self.should_skip_category(category):
            return
    product_id_xpath = ".//*[@name='sku']/@value"
    bv_id_xpath = ".//*[@class='stelline']/@id"
    for product in products:
        source_internal_id = self.extract(product.xpath(product_id_xpath))
        bv_id = product.xpath(bv_id_xpath).re_first(r'-([0-9]+)')
        # Both ids are required to build the review request.
        if not (source_internal_id and bv_id):
            continue
        bv_params = self.bv_base_params.copy()
        bv_params['bv_id'] = bv_id
        bv_params['offset'] = 0
        request = Request(url=self.get_review_url(**bv_params),
                          callback=self.parse_reviews)
        request.meta['last_user_review'] = \
            incremental_utils.get_latest_user_review_date_by_sii(
                self.mysql_manager, self.spider_conf['source_id'],
                source_internal_id)
        request.meta['bv_id'] = bv_id
        request.meta['product_id'] = source_internal_id
        request.meta['OriginalCategoryName'] = category.get('category_path')
        request.meta['filter_other_sources'] = False
        yield request
    next_page_url = self.extract_xpath(response, "//*[@rel='next']/@href")
    if next_page_url:
        request = Request(get_full_url(response, next_page_url),
                          callback=self.parse_category)
        request.meta['category'] = category
        yield request
def parse_review(self, response):
    """Parse user reviews, stopping at the first review older than the stored
    latest (incremental scraping) or at the first empty review."""
    product = response.meta['product']
    reviews = response.xpath('//article[contains(@id,"review_")]')
    if not reviews:
        return
    # From observation, at least currys.co.uk uses a different format to present review rating
    rating_xpath = response.meta.get('rating_xpath', '') or \
        './/reevoo-score/@data-score'
    last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
        self.mysql_manager, self.spider_conf['source_id'],
        product["source_internal_id"])
    for review in reviews:
        item = ReviewItem()
        date = self.extract(review.xpath(
            './/span[contains(@class, "date_publish")]/text()'))
        if date:
            item['TestDateText'] = date_format(date, '')
            parsed_date = dateparser.parse(item['TestDateText'],
                                           date_formats=['%Y-%m-%d'])
            # Reviews are newest-first: anything older than the DB is done.
            if parsed_date < last_user_review:
                return
        item['DBaseCategoryName'] = "USER"
        item['ProductName'] = product['ProductName']
        item['TestUrl'] = product['TestUrl']
        item['SourceTestRating'] = self.extract(review.xpath(rating_xpath))
        item['Author'] = self.extract(
            review.xpath('.//h4[@class="attribution-name"]/text()'))
        item['TestPros'] = self.extract_all(
            review.xpath('.//dd[@class="pros"]/text()'))
        item['TestCons'] = self.extract_all(
            review.xpath('.//dd[@class="cons"]/text()'))
        item['source_internal_id'] = product['source_internal_id']
        # All reviews after first empty review are empty
        if not (item['TestPros'] or item['TestCons']):
            return
        yield item
    next_page_url = self.extract(response.xpath("(//*[@rel='next']/@href)[1]"))
    if next_page_url:
        yield Request(get_full_url(response, next_page_url),
                      callback=self.parse_review, meta=response.meta)
def parse(self, response):
    """Extract product ids from the embedded JS blob, queue one Bazaarvoice
    review request per product, then paginate."""
    product_data = self.extract(response.xpath(
        "//script[@type='text/javascript']"
        "[contains(text(),'hof.data')]/text()"))
    current_page = response.meta.get('page_number')
    category = response.meta.get('category', '')
    original_url = response.meta.get('original_url', '')
    if not category:
        # We should be able to spot category name from the URL.
        # Otherwise we will need to parse category name from JavaScript,
        # as the site loads its product pages using knockout.js
        category = CategoryItem()
        category['category_path'] = response.url
        category['category_url'] = response.url
        yield category
        if self.should_skip_category(category):
            return
        current_page = 1
        original_url = response.url
    product_ids = re.findall(r'"FriendlyProductId":"([0-9]+)"', product_data)
    if not product_ids:
        # An empty page also terminates pagination.
        return
    for pid in product_ids:
        bv_params = self.bv_base_params.copy()
        bv_params['bv_id'] = pid
        bv_params['offset'] = 0
        request = Request(url=self.get_review_url(**bv_params),
                          callback=self.parse_reviews)
        request.meta['last_user_review'] = \
            incremental_utils.get_latest_user_review_date_by_sii(
                self.mysql_manager, self.spider_conf['source_id'], pid)
        request.meta['bv_id'] = pid
        request.meta['product_id'] = pid
        request.meta['OriginalCategoryName'] = category.get('category_path')
        request.meta['filter_other_sources'] = False
        yield request
    next_page = current_page + 1
    next_page_request = Request(url=original_url + '?page={}'.format(next_page),
                                callback=self.parse)
    next_page_request.meta['page_number'] = next_page
    next_page_request.meta['category'] = category
    next_page_request.meta['original_url'] = original_url
    yield next_page_request
def parse_product(self, response):
    """Build the product item from the page, emit MPN/EAN id items, and queue
    a Bazaarvoice review request.

    Fix: if ``source_internal_id_re`` does not match the URL,
    ``product['source_internal_id']`` is never set, and the original code then
    raised ``KeyError`` in the id/review sections. We now log and stop after
    yielding the (partial) product.
    """
    product_xpaths = {
        "PicURL": "//meta[@property='og:image']/@content",
        "ProductName": "//h1[@class='productHeading']//text()",
        "ProductManufacturer": "//h1[@class='productHeading']/text()"
    }
    product = self.init_item_by_xpaths(response, "product", product_xpaths)
    match = re.search(self.source_internal_id_re, response.url)
    if match:
        product['source_internal_id'] = match.group(1)
    product['TestUrl'] = response.url
    product["OriginalCategoryName"] = response.meta["category"][
        "category_path"]
    yield product
    # Guard: everything below requires the source_internal_id.
    if not product.get('source_internal_id'):
        self.logger.info("Could not scrape product at %s" % response.url)
        return
    mpn_value = self.extract(
        response.xpath("//span[@id='productMPN']/text()"))
    if mpn_value:
        mpn = ProductIdItem()
        mpn['source_internal_id'] = product["source_internal_id"]
        mpn['ProductName'] = product["ProductName"]
        mpn['ID_kind'] = "MPN"
        mpn['ID_value'] = mpn_value
        yield mpn
    ean_value = self.extract(
        response.xpath("//span[@id='productEAN']/text()"))
    if ean_value:
        ean = ProductIdItem()
        ean['source_internal_id'] = product["source_internal_id"]
        ean['ProductName'] = product["ProductName"]
        ean['ID_kind'] = "EAN"
        ean['ID_value'] = ean_value
        yield ean
    bv_params = self.bv_base_params.copy()
    bv_params['bv_id'] = product['source_internal_id']
    bv_params['offset'] = 0
    review_url = self.get_review_url(**bv_params)
    request = Request(url=review_url, callback=self.parse_reviews)
    # Incremental scraping: only reviews newer than the stored latest are wanted.
    last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
        self.mysql_manager, self.spider_conf['source_id'],
        product["source_internal_id"])
    request.meta['last_user_review'] = last_user_review
    request.meta['bv_id'] = product['source_internal_id']
    request.meta['product'] = product
    request.meta['filter_other_sources'] = False
    yield request
def parse_reviews(self, response):
    """Emit user reviews for the product in ``response.meta``, stopping at the
    first review older than the stored latest, then follow pagination."""
    product = response.meta['product']
    rating_xpath = ".//*[@class='review--header-rating']/text()"
    title_xpath = ".//h3[contains(@class, 'review--header-title')]/text()"
    summary_xpath = ".//div[contains(@class, 'review--description')]//text()"
    header_xpath = ".//div[@class='review--header-review-info']//text()"
    date_xpath = ".//div[@class='review--header-review-info']/time/@datetime"
    pros_xpath = ".//li[contains(@class, 'pros-and-cons-pro')]//*[@class!='is-visually-hidden']/text()"
    cons_xpath = ".//li[contains(@class, 'pros-and-cons-con')]//*[@class!='is-visually-hidden']/text()"
    last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
        self.mysql_manager, self.spider_conf['source_id'],
        product["source_internal_id"]
    )
    for review in response.xpath("//li[contains(@class, 'reviews__list-item')]"):
        date = self.extract_xpath(review, date_xpath)
        if date:
            date = date_format(date, '')
            # Reviews are newest-first: stop once we reach already-stored ones.
            if dateparser.parse(date, date_formats=['%Y-%m-%d']) < last_user_review:
                return
        rating = self.extract_xpath(review, rating_xpath)
        splitted = rating.split('/')
        if splitted:
            # Rating is rendered as "<score>/<scale>"; keep the score part.
            rating = splitted[0]
        author = ''
        header = self.extract_all_xpath(review, header_xpath)
        if header:
            # Header reads "<author> | <other info>".
            author = strip(header.split('|')[0])
        yield ReviewItem.from_product(
            product=product,
            tp='USER',
            rating=rating,
            title=self.extract_xpath(review, title_xpath),
            date=date,
            summary=self.extract_all_xpath(review, summary_xpath),
            pros=self.extract_all_xpath(review, pros_xpath, separator=' ; '),
            cons=self.extract_all_xpath(review, cons_xpath, separator=' ; '),
            author=author,
            scale=10)
    next_page_url = self.extract_xpath(response, "//a[@rel='next']/@href")
    if next_page_url:
        request = Request(get_full_url(response, next_page_url),
                          callback=self.parse_reviews)
        request.meta['product'] = product
        yield request
def parse_product(self, response):
    """Extract the microdata product, normalize its numeric ids, and queue a
    Bazaarvoice review request."""
    sii_re = r'(\d+)'
    extracted = extruct_helper.product_items_from_microdata(
        response, response.meta['category'])
    if not extracted:
        yield self._retry(response.request)
        return
    product = extracted.get('product')
    product_ids = extracted.get('product_ids', [])
    if not (product and product_ids):
        self.logger.info("Could not scrape product at %s" % response.url)
        return
    # unfortunately, we need to clean up the source_internal_id for this source
    match = re.search(sii_re, product['source_internal_id'])
    if match:
        product['source_internal_id'] = match.group(1)
    for pid in product_ids:
        pid['source_internal_id'] = product['source_internal_id']
        if pid['ID_kind'] == 'sku':
            # SKUs carry the same noise as the source_internal_id.
            sku_match = re.search(sii_re, pid['ID_value'])
            if sku_match:
                pid['ID_value'] = sku_match.group(1)
        yield pid
    bv_id = product['source_internal_id']
    if not (product['ProductName'] and product['source_internal_id'] and bv_id):
        self.logger.info("Could not scrape product at %s" % response.url)
        return
    yield product
    bv_params = self.bv_base_params.copy()
    bv_params['bv_id'] = bv_id
    bv_params['offset'] = 0
    request = Request(url=self.get_review_url(**bv_params),
                      callback=self.parse_reviews)
    request.meta['last_user_review'] = \
        incremental_utils.get_latest_user_review_date_by_sii(
            self.mysql_manager, self.spider_conf['source_id'],
            product["source_internal_id"])
    request.meta['filter_other_sources'] = False
    request.meta['bv_id'] = bv_id
    request.meta['product'] = product
    yield request
def parse_user_reviews(self, response):
    """Parse one page of user reviews and follow pagination."""
    # Featured reviews are always at the top. We cannot do incremental scraping before finishing
    # parsing all featured reviews, as there may be newer regular reviews following them.
    product = response.meta['product']
    content_xpaths = {
        "TestTitle": ".//p[@class='reviewTitle']/text()",
        "Author": ".//*[@class='reviewedBy']/a/text()",
        "SourceTestRating": ".//*[@class='ratingStarSmall']/text()"
    }
    summary_xpath = ".//div[@class='reviewText']/span[@class='smallContent']/text()"
    summary_more_xpath = ".//div[@class='reviewText']//span[@class='moreReview']/text()"
    last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
        self.mysql_manager, self.spider_conf['source_id'],
        product["source_internal_id"])
    for selector in response.xpath("//ul[@class='userlist']/li[@id]"):
        is_featured = 'featured' in self.extract(
            selector.xpath('./@class')).lower()
        review = self.parse_review(response, product, content_xpaths,
                                   'USER', selector)
        # incremental scraping
        if review.get('TestDateText', '') and not is_featured:
            parsed_date = dateparser.parse(review['TestDateText'],
                                           date_formats=['%Y-%m-%d'])
            if parsed_date < last_user_review:
                return
        # If we fail to get summary, then an exception will be thrown
        summary = selector.xpath(summary_xpath).extract_first()
        more = selector.xpath(summary_more_xpath).extract_first()
        if more:
            summary += more
        review['TestSummary'] = summary.strip()
        yield review
    next_page_url = self.extract(response.xpath("//li[@id='pagNext']/a/@href"))
    if next_page_url:
        yield Request(get_full_url(response, next_page_url),
                      callback=self.parse_user_reviews,
                      meta=response.meta)
def parse_product(self, response):
    """Scrape a product page with BeautifulSoup, emit the product, a price
    id item, and a request for its first review page.

    Fixes: the bare ``except: pass`` around price parsing is narrowed to the
    exceptions ``float``/``round`` can actually raise, the pointless
    ``''.join(<str>)`` is dropped, and the duplicated brand-link lookup is
    done once.
    """
    category = response.meta['category']
    soup = BeautifulSoup(response.body, "lxml")
    item_id = response.url.split('/')[-1].strip()
    product = ProductItem()
    product['source_internal_id'] = item_id
    product['ProductName'] = soup.find('h1', {
        'itemprop': 'name'
    }).text.strip()
    brand_link = soup.find('a', {'id': 'WMItemBrandLnk'})
    product['ProductManufacturer'] = brand_link.text.strip() if brand_link else ''
    product['OriginalCategoryName'] = category['category_path']
    product['PicURL'] = soup.find(
        'img', {'class': 'product-image'})['src'].strip()
    product['TestUrl'] = response.url
    yield product
    price = soup.find('div', {'itemprop': 'price'})
    product_id = ProductIdItem()
    # NOTE(review): product['source_id'] is never set in this method -- this
    # raises KeyError unless ProductItem carries a default; confirm upstream.
    product_id['source_id'] = product['source_id']
    product_id['ProductName'] = product['ProductName']
    product_id['source_internal_id'] = product['source_internal_id']
    if price:
        try:
            product_id['ID_kind'] = 'price'
            # Price formatted with a comma decimal separator, two decimals.
            product_id['ID_value'] = format(
                round(float(price.text.replace('$', '')), 2),
                ".2f").replace('.', ',')
        except (TypeError, ValueError):
            # Unparseable price: keep the id item without a value.
            pass
    yield product_id
    latest_review_date = get_latest_user_review_date_by_sii(
        self.mysql_manager, self.spider_conf['source_id'], item_id)
    review_page = 1
    reviews_link = reviews_link_pattern % (item_id, str(review_page))
    request = Request(reviews_link, callback=self.parse_review)
    request.meta['ProductName'] = product['ProductName']
    request.meta['item_id'] = item_id
    request.meta['review_page'] = review_page
    request.meta['latest_review_date'] = latest_review_date
    # The review count anchor tells the parser how many reviews to expect.
    anchors = soup.find_all('a', {'class': 'js-product-anchor'})
    for anchor in anchors:
        if 'reviews' in anchor.text:
            request.meta['max_idx'] = int(
                anchor.text.replace('reviews', '').strip())
            break
    yield request
def parse_product(self, response):
    """Build the product/category/id items and queue a Bazaarvoice review
    request.

    Fix: when ``source_internal_id_re`` found no match the original code still
    read ``product['source_internal_id']`` in the review section, raising
    ``KeyError``. We now log and stop (after yielding the product).
    """
    product_xpaths = {"PicURL": "(//*[@property='og:image'])[1]/@content",
                      "ProductName": "//h1//text()",
                      "OriginalCategoryName":
                          "//li[contains(@class, 'item category')][last()]/a/text()",
                      "ProductManufacturer":
                          "//th[@class='col label' and text()='Brand']/"
                          "following-sibling::*/text()"
                      }
    product = self.init_item_by_xpaths(response, "product", product_xpaths)
    bv_config_data = self.extract(
        response.xpath("//script[@type='text/javascript']"
                       "[contains(text(),'productId')]/text()"))
    if product.get('OriginalCategoryName', ''):
        category = CategoryItem()
        category_url = self.extract(response.xpath(
            "//li[contains(@class, 'item category')][last()]/a/@href"))
        category['category_url'] = get_full_url(response, category_url)
        category['category_leaf'] = product['OriginalCategoryName']
        category['category_path'] = category['category_leaf']
        yield category
    match = re.search(self.source_internal_id_re, bv_config_data)
    if match:
        product["source_internal_id"] = match.group(1).upper()
        product_id = ProductIdItem()
        product_id['source_internal_id'] = product["source_internal_id"]
        product_id['ProductName'] = product["ProductName"]
        product_id['ID_kind'] = "richersounds_id"
        product_id['ID_value'] = product["source_internal_id"]
        yield product_id
    yield product
    # Guard: the review request needs the source_internal_id.
    sii = product.get('source_internal_id')
    if not sii:
        self.logger.info("Could not scrape product at %s" % response.url)
        return
    bv_params = self.bv_base_params.copy()
    bv_params['bv_id'] = sii
    bv_params['offset'] = 0
    review_url = self.get_review_url(**bv_params)
    request = Request(url=review_url, callback=self.parse_reviews)
    last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
        self.mysql_manager, self.spider_conf['source_id'], sii
    )
    request.meta['last_user_review'] = last_user_review
    request.meta['bv_id'] = sii
    request.meta['product'] = product
    request.meta['filter_other_sources'] = False
    yield request
def parse_product(self, response):
    """Scrape the product page; emit the product, its boots.com id item, and a
    Bazaarvoice review request when all required fields are present."""
    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = response.meta['OriginalCategoryName']
    product['source_internal_id'] = self.extract(
        response.xpath("//div[@class='productid']/text()"))
    product['ProductName'] = self.extract(
        response.xpath("//h1[@itemprop='name']/text()"))
    product['ProductManufacturer'] = self.extract(
        response.xpath("//input[@id='productManufacturerName']/@value"))
    bv_id = self.extract(response.xpath("//input[@id='product_ID']/@value"))
    # Without a name, an internal id and a Bazaarvoice id there is nothing to emit.
    if not (product['ProductName'] and product['source_internal_id'] and bv_id):
        self.logger.info("Could not scrape product at %s" % response.url)
        return
    yield product
    product_id = self.product_id(product)
    product_id['ID_kind'] = "boots_com_id"
    product_id['ID_value'] = product['source_internal_id']
    yield product_id
    bv_params = self.bv_base_params.copy()
    bv_params['bv_id'] = bv_id
    bv_params['offset'] = 0
    request = Request(url=self.get_review_url(**bv_params),
                      callback=self.parse_reviews)
    request.meta['last_user_review'] = \
        incremental_utils.get_latest_user_review_date_by_sii(
            self.mysql_manager, self.spider_conf['source_id'],
            product["source_internal_id"])
    request.meta['filter_other_sources'] = False
    request.meta['bv_id'] = bv_id
    request.meta['product'] = product
    yield request
def parse_reviews(self, response):
    """Emit user reviews for the product in ``response.meta`` and follow
    pagination, stopping at the first review older than the stored latest.

    Fix: the original ``rating.strip('ratingSpriteUnder ratingSprite_')``
    misused ``str.strip`` -- its argument is a *set of characters*, not a
    prefix/suffix, so it only worked by accident while the rating digits were
    outside that set. The rating is now extracted explicitly with a regex.
    """
    next_page_xpath = '//a[@class="next-arrow"]/@href'
    product = response.meta['product']
    last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
        self.mysql_manager, self.spider_conf['source_id'],
        product["source_internal_id"]
    )
    reviews = response.xpath('//div[contains(@class,"reviewWidget")]')
    for review in reviews:
        user_review = ReviewItem()
        date = self.extract(review.xpath('.//span[@class="reviewDate"]/text()'))
        if date:
            user_review['TestDateText'] = date_format(date, '')
            current_user_review = dateparser.parse(
                user_review['TestDateText'], date_formats=['%Y-%m-%d'])
            # Reviews are newest-first: stop once we reach stored ones.
            if current_user_review < last_user_review:
                return
        user_review['DBaseCategoryName'] = "USER"
        user_review['SourceTestScale'] = 5
        user_review['ProductName'] = product['ProductName']
        user_review['TestUrl'] = product['TestUrl']
        user_review['source_internal_id'] = product['source_internal_id']
        # The score is encoded in the sprite class, e.g. "ratingSprite_4-5" -> "4.5".
        rating_class = self.extract(review.xpath(
            './/span[contains(@class,"ratingSpriteUnder")]/@class'))
        rating_match = re.search(r'ratingSprite_([0-9-]+)', rating_class)
        user_review['SourceTestRating'] = (
            rating_match.group(1).replace('-', '.') if rating_match else '')
        user_review['Author'] = self.extract(
            review.xpath('.//p[@class="name"]/text()'))
        user_review['TestTitle'] = self.extract(review.xpath('.//h2/text()'))
        user_review['TestSummary'] = self.extract_all(
            review.xpath('.//div[@class="reviewContainer"]/p/text()'))
        user_review['TestPros'] = self.extract_all(
            review.xpath('.//ul[@class="pros"]/li/text()'), '; ')
        user_review['TestCons'] = self.extract_all(
            review.xpath('.//ul[@class="cons"]/li/text()'), '; ')
        yield user_review
    next_page_url = self.extract_xpath(response, next_page_xpath)
    if next_page_url:
        next_page_url = get_full_url(response, next_page_url)
        request = Request(next_page_url, callback=self.parse_reviews)
        request.meta['product'] = product
        yield request
def parse_product(self, response):
    """Scrape the product page and queue its review listing.

    Fixes: the return value of ``set_query_parameter`` was discarded (strings
    are immutable, so the ``ReviewOrdering=2`` parameter was never applied to
    the review URL); the id regex is now a raw string with the ``.`` before
    ``html`` escaped, as clearly intended.
    """
    sii_re = r'-([^\-]+)\.html'
    product = ProductItem()
    # Drop any fragment so TestUrl is canonical.
    product['TestUrl'] = response.url.split('#')[0]
    product['OriginalCategoryName'] = response.meta['category'][
        'category_path']
    product['ProductName'] = self.extract(response.xpath('//h1/text()'))
    product['PicURL'] = self.extract(
        response.xpath('//a[@itemprop="image"]/@href'))
    product['ProductManufacturer'] = self.extract(
        response.xpath('//span[@itemprop="brand"]/a/span/text()'))
    match = re.search(sii_re, response.url)
    if not match:
        return
    source_internal_id = match.group(1)
    product['source_internal_id'] = source_internal_id
    yield product
    review_xpath = "//ul[@class='pagNum']/@data-action"
    total_page_xpath = "//ul[@class='pagNum']/li[@class='next']/preceding-sibling::li[1]/text()"
    review_url = self.extract_xpath(response, review_xpath)
    total_pages = self.extract_xpath(response, total_page_xpath)
    if not total_pages:
        total_pages = 1
    latest_db_date = get_latest_user_review_date_by_sii(
        self.mysql_manager, self.spider_conf["source_id"],
        source_internal_id)
    if review_url:
        # Order reviews newest-first so incremental scraping can stop early.
        review_url = set_query_parameter(review_url, 'ReviewOrdering', '2')
        review_url = get_full_url(response, review_url)
        request = Request(url=review_url, callback=self.parse_reviews)
        request.meta['product'] = product
        request.meta['current_page'] = 1
        # total_pages is always truthy here (defaults to 1 above).
        request.meta['total_pages'] = total_pages
        request.meta['latest_db_date'] = latest_db_date
        yield request
def parse(self, response):
    """Emit the product and its breadcrumb category, then queue a Bazaarvoice
    review request."""
    # Product
    product = self.init_item_by_xpaths(response, "product", {
        "ProductName":
            "//h1[contains(@class,'title')]/span[@itemprop='name']/text()",
    })
    product['source_internal_id'] = self.extract(
        response.xpath("//*[@itemprop='sku']/text()"))
    # Category
    category = CategoryItem()
    category['category_leaf'] = self.extract(response.xpath(
        "(//a[contains(@itemprop,'url')]/span[contains(@itemprop,'title')])[last()]/text()"))
    category['category_path'] = self.extract_all(response.xpath(
        "(//a[contains(@itemprop,'url')]/span[contains(@itemprop,'title')])/text()"),
        ' | ')
    # product's OriginalCategoryName should always match category_path of the corresponding category item
    product['OriginalCategoryName'] = category['category_path']
    yield product
    yield category
    # Review
    sii = product['source_internal_id']
    bv_params = self.bv_base_params.copy()
    bv_params['bv_id'] = sii
    bv_params['offset'] = 0
    request = Request(url=self.get_review_url(**bv_params),
                      callback=self.parse_reviews)
    request.meta['last_user_review'] = \
        incremental_utils.get_latest_user_review_date_by_sii(
            self.mysql_manager, self.spider_conf['source_id'], sii)
    request.meta['bv_id'] = sii
    request.meta['product'] = product
    request.meta['filter_other_sources'] = False
    yield request
def parse_product(self, response):
    """Emit the product (EAN doubles as the internal id) and queue a
    Bazaarvoice review request."""
    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = 'Cell phones'  # self.extract_all(response.xpath('//ol[@id="breadcrumb-list"]/li/a/text()'),"->")
    product['ProductName'] = self.extract(
        response.xpath('//h1/span[@itemprop="name"]//text()'))
    product['PicURL'] = get_full_url(
        response,
        self.extract(response.xpath('//img[@id="Image1x"]//@src')))
    product['ProductManufacturer'] = self.extract(
        response.xpath('//h1/span[@itemprop="brand"]//text()'))
    product['source_internal_id'] = self.extract(
        response.xpath('//meta[@itemprop="gtin13"]/@content'))
    yield product
    sii = product['source_internal_id']
    bv_params = self.bv_base_params.copy()
    bv_params['bv_id'] = sii
    bv_params['offset'] = 0
    request = Request(url=self.get_review_url(**bv_params),
                      callback=self.parse_reviews)
    request.meta['last_user_review'] = \
        incremental_utils.get_latest_user_review_date_by_sii(
            self.mysql_manager, self.spider_conf['source_id'], sii)
    request.meta['bv_id'] = sii
    request.meta['product'] = product
    # This source needs a secondary pass over each review.
    request.meta['extra_parser'] = self.final_review_parser
    request.meta['filter_other_sources'] = False
    yield request
def call_review(self, response, product=None, incremental=True):
    """Build and yield the Bazaarvoice review request for this page."""
    bv_id = response.meta.get('bv_id', None)
    if not bv_id:
        bv_id = self.extract_xpath(response, "//div/@data-product-id")
    if incremental:
        # NOTE(review): assumes `product` is supplied whenever incremental=True;
        # with the default product=None this line would raise -- confirm callers.
        response.meta['last_user_review'] = \
            incremental_utils.get_latest_user_review_date_by_sii(
                self.mysql_manager, self.spider_conf['source_id'],
                product["source_internal_id"])
    params = self.bv_base_params.copy()
    params['bv_id'] = bv_id
    params['offset'] = 0
    full_url = self.FULL_URL_PATTERN.format(**params)
    response.meta['product'] = product
    response.meta['bv_id'] = bv_id
    yield Request(full_url, callback=self.parse_reviews, meta=response.meta)
def parse_category(self, response):
    """Walk a category listing and queue Bazaarvoice review requests per SKU.

    Fix: in the missing-SKU branch the original called
    ``response.xpath("./a/@href")`` -- a product-relative xpath evaluated
    against the document root, which never matches -- so the logged product URL
    was always empty. It now queries the product selector.
    """
    products_xpath = "//div[@class='Productlist']/div"
    product_sku_xpath = "./@data-sku"
    has_review_xpath = ".//span[@class='Rating-average']"
    next_page_xpath = "(//*[@rel='next'])[1]/@href"
    category = response.meta.get('category', '')
    if not category:
        # the category we get here is actually parent category
        category_json_ld = extruct_helper.extract_json_ld(
            response.text, 'BreadcrumbList')
        if not category_json_ld:
            request = self._retry(response.request)
            yield request
            return
        category = extruct_helper.category_item_from_breadcrumbs_json_ld(
            category_json_ld)
        current_category_name = self.extract(
            response.xpath(
                "//div[@id='breadcrumb']/ul/li[@class='pad-left']/text()"))
        # Append the current leaf if the breadcrumbs only hold the parent.
        if current_category_name.lower(
        ) != category['category_leaf'].lower():
            category['category_leaf'] = current_category_name
            category['category_path'] = u'{} | {}'.format(
                category['category_path'], current_category_name)
        category['category_url'] = response.url
        yield category
        if self.should_skip_category(category):
            return
    products = response.xpath(products_xpath)
    # Not a leaf category page
    if not products:
        return
    # We skip the product page, as feelunique.com tries to block us if we access too many of their pages,
    # but it is impossible for them to block the access to Bazaarvoice API
    for product in products:
        has_review = product.xpath(has_review_xpath)
        if not has_review:
            continue
        product_sku = self.extract(product.xpath(product_sku_xpath))
        if product_sku:
            product_id = ProductIdItem()
            product_id['source_internal_id'] = product_sku
            product_id['ID_kind'] = 'feelunique_internal_id'
            product_id['ID_value'] = product_sku
            yield product_id
            bv_params = self.bv_base_params.copy()
            bv_params['bv_id'] = product_sku
            bv_params['offset'] = 0
            review_url = self.get_review_url(**bv_params)
            last_user_review = \
                incremental_utils.get_latest_user_review_date_by_sii(
                    self.mysql_manager, self.spider_conf['source_id'],
                    product_sku)
            request = Request(review_url, callback=self.parse_reviews)
            request.meta['last_user_review'] = last_user_review
            request.meta['filter_other_sources'] = False
            request.meta['OriginalCategoryName'] = category[
                'category_path']
            request.meta['bv_id'] = product_sku
            yield request
        else:
            product_url_xpath = "./a/@href"
            # Query relative to the product node, not the document root.
            product_url = self.extract(product.xpath(product_url_xpath))
            product_url = get_full_url(response, product_url)
            self.logger.info("Failed to get SKU for product at %s" %
                             product_url)
    next_page_url = self.extract_xpath(response, next_page_xpath)
    if next_page_url:
        next_page_url = get_full_url(response, next_page_url)
        next_page_request = Request(next_page_url,
                                    callback=self.parse_category)
        next_page_request.meta['category'] = category
        yield next_page_request
def parse_product(self, response):
    """Scrape a canon.nl product page; emit category, id, and product items,
    then queue a Bazaarvoice review request.

    Fix: removed a leftover debug ``print('got pic_url')`` statement.
    """
    # sku
    # --------------------------------------------------------------------
    sku_xpath = "//meta[@name='Product-Article-Number']/@content"
    sku = self.extract_xpath(response, sku_xpath)
    if not sku:
        # not a product page
        return
    # set up product item
    # --------------------------------------------------------------------
    product_xpaths = {
        "ProductName": "//meta[@name='productName']/@content",
        "OriginalCategoryName": "//meta[@name='Product-Sub-Category']/@content",
    }
    product = self.init_item_by_xpaths(response, "product", product_xpaths)
    product['source_internal_id'] = sku
    # all products are from canon.nl
    product['ProductManufacturer'] = 'Canon'
    # product picture
    # --------------------------------------------------------------------
    pic_url_xpath = "//meta[@name='Product-Image-Large']/@content"
    pic_url_xpath_alt = "//meta[@name='Product-Image-Small']/@content"
    # only got a relative url from the xpath, so resolve it against the page
    pic_url = self.extract(response.xpath(pic_url_xpath))
    if not pic_url:
        pic_url = self.extract(response.xpath(pic_url_xpath_alt))
    product['PicURL'] = get_full_url(response, pic_url)
    # double check product name
    # --------------------------------------------------------------------
    product_name = product.get('ProductName')
    if not product_name:
        product_name_xpath = "//meta[@name='og:title']/@content"
        product_name = self.extract(response.xpath(product_name_xpath))
        if product_name:
            product['ProductName'] = product_name
    # double check OriginalCategoryName
    # --------------------------------------------------------------------
    original_category = product.get('OriginalCategoryName')
    if not original_category:
        # a typical canon.nl product page has category names in their url
        # for example:
        # https://www.canon.nl/for_home/product_finder/printers/laser/i-sensys_lbp7750cdn/
        # where the last item seperated by '/' is the actual product and the
        # two items before the last two are helpful for our category matching
        sep = '|'
        start_category_ind = -4
        end_category_ind = -2
        all_category_names = sep.join(
            response.url.split('/')[start_category_ind:end_category_ind])
        product['OriginalCategoryName'] = all_category_names
    # set up category item
    # --------------------------------------------------------------------
    if product.get('OriginalCategoryName', ''):
        category = CategoryItem()
        category['category_path'] = product['OriginalCategoryName']
        yield category
    # set up product_id item
    # --------------------------------------------------------------------
    product_id = ProductIdItem()
    product_id['source_internal_id'] = product['source_internal_id']
    product_id['ProductName'] = product['ProductName']
    product_id['ID_kind'] = 'canon_id'
    product_id['ID_value'] = product['source_internal_id']
    yield product_id
    yield product
    # set up for bv review
    # --------------------------------------------------------------------
    bv_params = self.bv_base_params.copy()
    bv_params['bv_id'] = product['source_internal_id']
    bv_params['offset'] = 0
    review_url = self.get_review_url(**bv_params)
    request = Request(url=review_url, callback=self.parse_reviews)
    last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
        self.mysql_manager, self.spider_conf['source_id'],
        product["source_internal_id"])
    request.meta['last_user_review'] = last_user_review
    request.meta['bv_id'] = product['source_internal_id']
    request.meta['product'] = product
    request.meta['filter_other_sources'] = False
    yield request
def parse_product(self, response):
    """Parse an Argos product page.

    Mobile pages are redirected to their canonical desktop version.
    When the product name, sku and Bazaarvoice id can all be found,
    yields the product, its id item and a request for the first page of
    Bazaarvoice reviews; otherwise logs and yields nothing.
    """
    mobile_xpath = "//*[@id='mobile_content_bar']"
    canonical_url_xpath = "//link[@rel='canonical']/@href"
    if response.xpath(mobile_xpath):
        # Mobile variant detected: re-request the canonical desktop page
        # with the same meta and parse that instead.
        desktop_request = Request(
            url=self.extract(response.xpath(canonical_url_xpath)),
            callback=self.parse_product)
        desktop_request.meta['category'] = response.meta['category']
        desktop_request.meta['review_url'] = response.meta['review_url']
        yield desktop_request
        return

    pic_url_xpath = '//img[contains(@class,"s7carousel-main-image-slide-vertical")][1]/@src'
    product_name_xpath = "//div[@class='product-name']//span[@itemprop='name']/text()"
    product_name_alt_xpath = "//div[@id='pdpProduct']/h1/text()"
    product_id_xpath = "//*[@itemprop='sku']/text()"
    #product_id_alt_xpath = "//span[contains(@class,'partnumber')]/text()"
    manufacturer_xpath = "//*[@itemprop='brand']/text()"

    item = ProductItem()
    item['TestUrl'] = response.url
    item['OriginalCategoryName'] = response.meta['OriginalCategoryName']
    item['source_internal_id'] = self.extract(
        response.xpath(product_id_xpath))
    item['ProductName'] = self.extract(
        response.xpath(product_name_xpath))
    item['ProductManufacturer'] = self.extract(
        response.xpath(manufacturer_xpath))
    item['PicURL'] = self.extract(response.xpath(pic_url_xpath))
    if not item['ProductName']:
        item['ProductName'] = self.extract(
            response.xpath(product_name_alt_xpath))
    # the id from alt_xpath is different from the default xpath, so it is
    # deliberately NOT used as a fallback for source_internal_id:
    # if not item['source_internal_id']:
    #     item['source_internal_id'] = self.extract(response.xpath(product_id_alt_xpath))

    # The Bazaarvoice id is the trailing number of the product URL.
    bv_id = ''
    bv_id_match = re.search(r'product/([0-9]+)$', response.url, re.I)
    if bv_id_match:
        bv_id = bv_id_match.group(1)

    if not (item['ProductName'] and item['source_internal_id'] and bv_id):
        self.logger.info("Could not scrape product at %s" % response.url)
        return

    yield item

    id_item = self.product_id(item)
    id_item['ID_kind'] = "argos_uk_id"
    id_item['ID_value'] = item['source_internal_id']
    yield id_item

    review_request = Request(
        url=self.get_review_url(bv_id=bv_id, offset=0),
        callback=self.parse_reviews)
    review_request.meta['last_user_review'] = \
        incremental_utils.get_latest_user_review_date_by_sii(
            self.mysql_manager,
            self.spider_conf['source_id'],
            item["source_internal_id"])
    review_request.meta['filter_other_sources'] = False
    review_request.meta['bv_id'] = bv_id
    review_request.meta['product'] = item
    yield review_request
def parse_reviews(self, response):
    """Parse Bazaarvoice reviews embedded in a JS payload.

    The response body is JavaScript; the review HTML lives in the
    'materials' var under key 'BVRRSourceID'.  Yields review items that
    belong to this product and were not syndicated from another source,
    then (if present) a Request for the next review page.

    Expects in response.meta: 'product' (required), and optionally
    'filter_other_sources', 'extra_review_parser', 'last_user_review'.
    """
    jstree = js2xml.parse(response.body)
    # NOTE(review): `xml` is never used below — pretty_print is probably a
    # leftover from debugging; confirm before removing.
    xml = js2xml.pretty_print(jstree)
    html_xpath = "//var[@name='materials']/object/property[@name='BVRRSourceID']/string/text()"
    html = jstree.xpath(html_xpath)
    if html:
        # Re-parse the extracted HTML fragment with its own selector.
        selector = Selector(text=html[0])
        next_page_xpath = '(//*[contains(@class,"BVRRNextPage")])[1]/a/@data-bvjsref'
        review_list_xpath = '//*[contains(@class,"BVRRContentReview")]'
        from_product_url_xpath = ".//div[contains(@class, 'BVDI_SUAttribution')]//a[@class='BVDILink']/@href"
        from_another_source_xpath = ".//*[contains(@class,'BVRRSyndicatedContentAttribution')]"
        filter_other_sources = response.meta.get('filter_other_sources', None)
        extra_review_parser = response.meta.get('extra_review_parser', None)
        last_user_review = response.meta.get('last_user_review', None)
        product = response.meta['product']
        if not product["source_internal_id"]:
            raise Exception("BV Product without source_internal_id")
        if not last_user_review:
            # Incremental crawl cutoff from the database when the caller
            # did not pass one.
            # NOTE(review): uses self.source_id here while sibling methods
            # use self.spider_conf['source_id'] — confirm both exist.
            last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
                self.mysql_manager, self.source_id, product["source_internal_id"])
        review_list = selector.xpath(review_list_xpath)
        if not review_list:
            return
        for review_selector in review_list:
            skip_review = False
            if filter_other_sources:
                # Drop reviews syndicated from another Bazaarvoice source.
                skip_review = review_selector.xpath(from_another_source_xpath)
            from_product_url = self.extract_xpath(review_selector, from_product_url_xpath)
            from_product = True
            if from_product_url:
                # Keep only reviews whose attribution link points at this
                # product's source_internal_id (case-insensitive substring).
                from_product = (product["source_internal_id"].lower()
                                in from_product_url.lower())
            review = self._parse_review(product, review_selector, extra_review_parser)
            if last_user_review:
                # Reviews are assumed newest-first: stop the whole parse
                # (no next page either) once an older review is reached.
                current_user_review = datetime.strptime(
                    review['TestDateText'], '%Y-%m-%d')
                if last_user_review > current_user_review:
                    return
            if from_product and not skip_review:
                yield review
        next_page_url = self.extract_xpath(selector, next_page_xpath)
        if next_page_url:
            # Propagate original headers and filter settings to the next page.
            headers = response.request.headers
            request = Request(next_page_url, callback=self.parse_reviews, headers=headers)
            request.meta['product'] = product
            request.meta['last_user_review'] = last_user_review
            request.meta['filter_other_sources'] = filter_other_sources
            request.meta['extra_review_parser'] = extra_review_parser
            yield request
def parse_product(self, response):
    """Parse a product page using microdata extracted via extruct.

    Yields the product, its product-id items, an optional EAN id, and a
    request for the first (latest-sorted) review page carrying paging
    metadata for parse_reviews.  Retries the page when exactly one
    product could not be extracted.
    """
    category = response.meta['category']
    items = extruct_helper.get_microdata_extruct_items(
        response.body_as_unicode())
    ean_xpath = '//a[@data-ean]/@data-ean'
    brand_alt_xpath = "//meta[@property='product:brand']/@content"
    product = list(
        extruct_helper.get_products_microdata_extruct(
            items, response, category))
    if len(product) != 1:
        # Extraction failed (or matched several products): retry the page.
        request = self._retry(response.request)
        yield request
        return
    product_dict = product[0]
    product = product_dict['product']
    if not product['ProductManufacturer']:
        # Microdata had no brand; fall back to the og product:brand meta.
        product['ProductManufacturer'] = self.extract_xpath(
            response, brand_alt_xpath)
    yield product
    for product_id in product_dict['product_ids']:
        yield product_id

    # BUGFIX: int() was previously applied before the truthiness check, so
    # a page without a data-ean attribute raised ValueError on int('').
    ean_value = self.extract_xpath(response, ean_xpath)
    if ean_value:
        ean = self.product_id(product, kind='EAN', value=int(ean_value))
        yield ean

    # Review paging metadata lives on two <ul> elements in the page.
    first_page_review_xpath = "//ul[contains(@class, 'js-product-reviews-first')]/@data-href"
    next_page_review_xpath = "//ul[contains(@class, 'js-product-reviews-remaining')]/@data-href"
    reviews_per_page_xpath = "//ul[contains(@class, 'js-product-reviews-remaining')]/@data-paged-per-page"
    total_reviews_xpath = "//ul[contains(@class, 'js-product-reviews-remaining')]/@data-paged-all"
    initial_index_xpath = "//ul[contains(@class, 'js-product-reviews-remaining')]/@data-paged-current-index"
    paging_parameter_xpath = "//ul[contains(@class, 'js-product-reviews-remaining')]/@data-paged-url-param"

    first_page_review_url = self.extract_xpath(response, first_page_review_xpath)
    if not first_page_review_url:
        # BUGFIX: previously fell through and yielded a Request with an
        # empty URL, which scrapy rejects with ValueError.
        return
    first_page_review_url = get_full_url(response, first_page_review_url)
    first_page_review_url = set_query_parameter(
        first_page_review_url, 'sorting', 'LATEST')

    next_page_review_url = self.extract_xpath(response, next_page_review_xpath)
    paging_meta = {}
    if next_page_review_url:
        # Incremental cutoff so parse_reviews can stop at already-seen reviews.
        last_review_db = get_latest_user_review_date_by_sii(
            self.mysql_manager,
            self.spider_conf['source_id'],
            product['source_internal_id'])
        next_page_review_url = get_full_url(response, next_page_review_url)
        next_page_review_url = set_query_parameter(
            next_page_review_url, 'sorting', 'LATEST')
        reviews_per_page = self.extract_xpath(response, reviews_per_page_xpath)
        total_reviews = self.extract_xpath(response, total_reviews_xpath)
        current_index = self.extract_xpath(response, initial_index_xpath)
        paging_parameter = self.extract_xpath(response, paging_parameter_xpath)
        paging_meta = {
            'next_page_review_url': next_page_review_url,
            'reviews_per_page': int(reviews_per_page),
            'total_reviews': int(total_reviews),
            'current_index': int(current_index),
            'paging_parameter': paging_parameter,
            'last_review_db': last_review_db,
        }

    meta = {'product': product}
    # Reviews are served only to AJAX requests from the product page.
    headers = {
        'Referer': response.url,
        'X-Requested-With': 'XMLHttpRequest'
    }
    meta.update(paging_meta)
    request = Request(first_page_review_url, meta=meta, headers=headers,
                      callback=self.parse_reviews)
    yield request