def _populate_from_html(self, response, product):
    """Fill ``product`` from the markup of a product page.

    Fields are written through ``cond_set`` / ``cond_set_value``
    (presumably these only fill fields that are still unset -- confirm
    against the helpers, which are defined outside this block).  The
    statement order is kept deliberately, since earlier setters may take
    precedence over later ones.

    Args:
        response: page response; ``css()`` and ``url`` are used.
        product: dict-like item that is updated in place.
    """

    def absolute(url):
        # Image sources are resolved against the page URL.
        return urlparse.urljoin(response.url, url)

    image_sources = response.css('[itemprop=image]::attr(src)').extract()
    cond_set(product, 'image_url', image_sources, absolute)

    _populate_from_open_graph_product(response, product)

    price_texts = response.css('.currentPrice ins::text').extract()
    cond_set(product, 'price', price_texts, unicode.strip)

    brand_texts = response.css('[itemprop=brand]::text').extract()
    cond_set(product, 'brand', brand_texts)
    if not product.get('brand', None):
        # Record pages on which no brand could be found.
        dump_url_to_file(response.url)

    name_texts = response.css('[itemprop=name]::text').extract()
    cond_set(product, 'title', name_texts)

    description_nodes = response.css('#longDesc article').extract()
    description = description_nodes[0] if description_nodes else None
    cond_set_value(product, 'description', description)

    # The reseller id is the run of digits preceding "-pdt" in the URL.
    id_matches = re.findall("(\d+)-pdt", response.url)
    reseller_id = id_matches[0] if id_matches else None
    cond_set_value(product, 'reseller_id', reseller_id)

    self._unify_price(product)
def _scrape_product_links(self, response):
    """Yield a ``(request, product)`` pair for each product box on the page.

    Every product is pre-populated from its listing box and attached to
    the request's ``meta`` under ``'product'``; the request calls back into
    ``self.parse_product``.  User agents are taken round-robin from the
    module-level ``USER_AGENT_LIST``, which is mutated in the process.
    """
    for product_box in self._fetch_product_boxes(response):
        target_url = urlparse.urljoin(
            response.url, self._link_from_box(product_box))

        item = SiteProductItem()
        self._populate_from_box(response, product_box, item)
        if not item.get('brand', None):
            # Record listing pages that produced a brand-less product.
            dump_url_to_file(response.url)

        request_meta = response.meta.copy()
        request_meta['product'] = item

        # Rotate: take the agent at the head and put it back at the tail.
        agent = USER_AGENT_LIST.pop(0)
        USER_AGENT_LIST.append(agent)

        product_request = Request(
            target_url, callback=self.parse_product, meta=request_meta)
        product_request.headers.setdefault('User-Agent', agent)

        yield product_request, item
def parse_product(self, response):
    """Complete the product carried in ``response.meta`` from its page.

    Fills title, brand (guessed from the title when missing), price,
    image URL, description, stock flag, reseller id, locale and model.

    Args:
        response: product page response; ``meta['product']`` must hold the
            item to update.

    Returns:
        ``self._request_related_products(response)`` when
        ``self.fetch_related_products`` is truthy, otherwise the product.
    """
    product = response.meta['product']

    cond_set(product, 'title',
             response.css('#pdpProduct h1::text').extract(),
             lambda s: s.strip(' \n'))

    if not product.get('brand', None):
        title = product.get('title')
        brand = guess_brand_from_first_words(title.strip() if title else '')
        if brand:
            product['brand'] = brand
    if not product.get('brand', None):
        # Record pages on which no brand could be determined.
        dump_url_to_file(response.url)

    if product.get('price') is None:
        currency = response.css('.currency::text').extract()
        currency = currency[0] if currency else ''
        price = response.css('.actualprice .price::text').re('\d+')
        price = price[0] if price else ''
        cond_set_value(product, 'price', currency + price)

    # Fall back to an empty string so a missing/None price is reported as
    # invalid instead of raising TypeError on the ``in`` test.
    # NOTE(review): a price that is already a Price object is not handled
    # here -- confirm what upstream population stores in this field.
    price_text = product.get('price') or u''
    if u'£' not in price_text:
        self.log('Invalid price at: %s' % response.url, level=ERROR)
    else:
        product['price'] = Price(
            price=price_text.replace(u'£', '').strip(),
            priceCurrency='GBP')

    cond_set(product, 'image_url',
             response.css('#mainimage.photo::attr(src)').extract(),
             lambda url: urlparse.urljoin(response.url, url))

    cond_set(product, 'description',
             response.css('.fullDetails').extract(), _inner_html)

    cond_set(product, 'is_out_of_stock',
             response.css('#globalDeliveryGrey[style="display:block;"]'),
             bool)

    # The id is a single scalar, so it goes through cond_set_value;
    # cond_set is used with sequences everywhere else and would presumably
    # keep only the first character of the string.
    reseller_id = re.findall(r'partNumber/(\d+)', response.url)
    cond_set_value(product, 'reseller_id',
                   reseller_id[0] if reseller_id else None)

    # Hardcoded
    cond_set_value(product, 'locale', 'en-GB')

    cond_set(
        product, 'model',
        response.xpath('//div[@class="fullDetails"]/ul/li/text()').re(
            'EAN:\s(.*).'))

    if self.fetch_related_products:
        return self._request_related_products(response)
    return product
def _populate_from_html(self, response, product):
    """Fill ``product`` from an item page, including seller information.

    Hard-coded fields are applied first via
    ``self._populate_hardcoded_fields``; the remaining fields go through
    ``cond_set`` / ``cond_set_value`` / ``cond_replace`` (helpers defined
    outside this block -- presumably "set if missing" and "overwrite"
    respectively; confirm there).

    Args:
        response: page response; ``css()``, ``xpath()`` and ``url`` are used.
        product: dict-like item that is updated in place.
    """
    self._populate_hardcoded_fields(product)

    title_texts = response.css('#itemTitle::text').extract()
    cond_set(product, 'title', title_texts)

    price_texts = response.css(
        '[itemprop=price]::text , '
        '#mm-saleDscPrc::text').extract()
    cond_set(product, 'price', price_texts, self._unify_price)

    seller_names = response.xpath(
        '//div[@class="mbg"]/a/span/text()').extract()
    if seller_names:
        seller_name = seller_names[0].strip()
        # The single marketplace entry reuses whatever price is set so far.
        product["marketplace"] = [{
            "name": seller_name,
            "price": product.get("price", None)
        }]

    image_sources = response.css('[itemprop=image]::attr(src)').extract()
    cond_replace(product, 'image_url', image_sources)

    description_xpath = '//*[@id="vi-desc-maincntr"]/node()[normalize-space()]'
    description_parts = response.xpath(description_xpath).extract()
    cond_set_value(product, 'description', description_parts, ''.join)

    canonical_links = response.css('[rel=canonical]::attr(href)').extract()
    cond_replace(product, 'url', canonical_links)

    brand_xpath = '//td[@class="attrLabels" and contains(text(), "Brand:")]' \
        '/following-sibling::td/span/text()'
    cond_set(product, 'brand', response.xpath(brand_xpath).extract())
    if not product.get('brand', None):
        # Record pages on which no brand could be found.
        dump_url_to_file(response.url)

    model_xpath = '//td[@class="attrLabels" and contains(text(), "Model:")]' \
        '/following-sibling::td/span/text()'
    cond_set(product, 'model', response.xpath(model_xpath).extract())

    # The reseller id is the URL segment following "-/".
    id_matches = re.findall("-\/([^\/&?\.\s]+)", response.url)
    reseller_id = id_matches[0] if id_matches else None
    cond_set_value(product, 'reseller_id', reseller_id)
def parse_product(self, response):
    """Populate the product in ``response.meta`` from its detail page.

    Builds buyer reviews from the star icons of each review block, then
    fills title, price, description, image, locale, brand, url, upc,
    related products and the stock flag.

    Args:
        response: product page response; ``meta['product']`` must hold the
            item to update.

    Returns:
        The updated product item.
    """
    prod = response.meta['product']

    reviews = response.xpath('//div[@id="review_loading"]/'
                             'following::div[contains(@id, "review_")]')
    if reviews:
        stars = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
        total = 0
        stars_sum = 0
        for review in reviews:
            stars_count = len(review.xpath('./i[@class="icon-star"]'))
            # Blocks without stars are not counted as reviews; counts
            # outside 1..5 are skipped too instead of raising KeyError.
            if stars_count not in stars:
                continue
            total += 1
            stars[stars_count] += 1
            stars_sum += stars_count
        avg = float(stars_sum) / float(total) if total else float(0)
        prod['buyer_reviews'] = BuyerReviews(total, avg, stars)
    else:
        prod['buyer_reviews'] = ZERO_REVIEWS_VALUE

    title = response.xpath(
        '//h2[@class="product_name product_title"]'
        '/span[@itemprop="name"]/text()'
    ).extract()
    cond_set(prod, 'title', title)

    price_text = is_empty(
        response.xpath(
            '//div[@class="price"]/text() |'
            ' //div[@class="price"]/div/text()').extract(),
        "")
    # Each match is a (currency symbol, amount) tuple; take the first one.
    price = is_empty(re.findall('(.?)(\d+.\d+)', price_text), 0)
    if price:
        priceCurrency = self.convert_currency[is_empty(price, "")]
        prod["price"] = Price(priceCurrency=priceCurrency, price=price[1])

    des = response.xpath(
        '//div[@class="clearfix text_box margin_after bg_white"]'
        '| //div[@class="wide_page"]').extract()
    if not des:
        des = response.xpath(
            "//span[contains(@class,'product_smallprint')]").extract()
    cond_set(prod, 'description', des)

    img_url = response.xpath('//img[@id="product_image"]/@src').extract()
    cond_set(prod, 'image_url', img_url)

    cond_set(prod, 'locale', ['en-US'])
    cond_set(prod, 'brand', ['NO BRAND'])
    if not prod.get('brand', None):
        dump_url_to_file(response.url)

    prod['url'] = unicode(response.url)

    cond_set(
        prod, 'upc',
        response.xpath("//script[contains(text(),'window.product = ')]").
        re(r"'id' : \"(\d+)\""))

    items = response.xpath(
        '//a[contains(@class,"g-med")] | //a[contains(@class,"g-large")]')
    related = []
    for item in items:
        name = item.xpath('.//img/@title').extract()
        link = item.xpath('.//@href').extract()
        if name and link:
            # Both values are unwrapped the same way; previously the link
            # was stored as a (list, "") tuple by mistake.
            related.append(RelatedProduct(title=is_empty(name, ""),
                                          url=is_empty(link, "")))
    prod['related_products'] = {'Similar Products': related}

    available = response.xpath(
        '//meta[@property="og:price:availability"]/@content').extract()
    if 'preorder' in available:
        prod['is_out_of_stock'] = True
    elif 'instock' in available:
        prod['is_out_of_stock'] = False

    return prod
def parse_product(self, response):
    """Populate the product and chain the follow-up recommendation request.

    Fills title, brand, upc, price, image, reseller id, optional
    description, locale and stock flag from the page.  When
    ``self._extract_rr_parms`` yields a payload, a review request is stored
    in ``response.meta['reevoo']`` and a request for the recommendation
    script (handled by ``self._parse_rr_json``) is returned.

    Args:
        response: product page response; ``meta['product']`` must hold the
            item to update.

    Returns:
        A ``Request`` for the recommendation script when a payload is
        available, otherwise the product itself.
    """
    product = response.meta['product']

    cond_set(
        product, 'title',
        response.xpath("//section[@itemscope]/h1"
                       "/span[@itemprop='name']/text()").extract())

    cond_set(
        product, 'brand',
        response.xpath("//section[@itemscope]/h1"
                       "/span[@itemprop='brand']/text()").extract())
    if not product.get('brand', None):
        dump_url_to_file(response.url)

    cond_set(
        product, 'upc',
        response.xpath("//section[@itemscope]/meta[@itemprop='identifier']"
                       "/@content").extract())

    price = response.xpath(
        "//section[@itemscope]/div[contains(@class,'productDetail')]"
        "/section[contains(@class,'description')]"
        "/div/div[contains(@class,'productPrices')]"
        "/span[@itemprop='price']/ins/text()").re(FLOATING_POINT_RGEX)
    if price:
        product['price'] = Price(price=price[0], priceCurrency='GBP')

    cond_set(
        product, 'image_url',
        response.xpath(
            "//section[@itemscope]/descendant::section[@class='productMedias']"
            "/div[@id='currentView']/a/img/@src").extract())

    # The reseller id is the run of digits preceding "-pdt" in the URL.
    regex = "(\d+)-pdt"
    reseller_id = re.findall(regex, response.url)
    reseller_id = reseller_id[0] if reseller_id else None
    cond_set_value(product, "reseller_id", reseller_id)

    if self.DO_DESCRIPTION:
        cond_set(product, 'description',
                 response.xpath("//section[@id='longDesc']").extract())

    cond_set_value(product, 'locale', "en-GB")

    out_of_stock = response.xpath(
        "//div[contains(@class,'productDetail')]"
        "/section[@class='col3']/div[@class='nested']"
        "/strong/text()").re(r"Out of stock")
    if out_of_stock:
        product['is_out_of_stock'] = True

    payload = self._extract_rr_parms(response)
    if not payload:
        # Check before touching payload['p']: with an empty payload the
        # lookup used to raise and this branch could never be reached.
        self.log("No {rr} payload at %s" % response.url, DEBUG)
        return product

    productid = payload['p']
    # Unconditionally overrides any upc taken from the markup above.
    product['upc'] = productid

    review_url = (
        'http://mark.reevoo.com/reevoomark/en-GB/product?sku={sku}'
        '&trkref=PCG').format(sku=productid)
    new_meta = response.meta.copy()
    new_meta['handle_httpstatus_list'] = [404]
    reevoo_request = Request(url=review_url,
                             callback=self._parse_reevoo,
                             meta=new_meta)
    # Stored before the copy below so the next request's meta carries it.
    response.meta['reevoo'] = reevoo_request

    new_meta = response.meta.copy()
    rr_url = urlparse.urljoin(self.SCRIPT_URL,
                              "?" + urllib.urlencode(payload))
    return Request(rr_url, self._parse_rr_json, meta=new_meta)
def parse_product(self, response):
    """Populate the product from the embedded ``page_products`` data.

    The page body is searched for a ``page_products`` blob, which is
    coerced into JSON.  When that succeeds its values are used; otherwise
    the fields are scraped from the markup.  If the page has a
    "You May Also Like" section and the needed ids are present, a request
    for recommendations is returned instead of the product.

    Args:
        response: product page response; ``meta['product']`` must hold the
            item to update.

    Returns:
        A ``Request`` handled by ``self._related_parse`` or the product.
    """
    product = response.meta['product']
    body = response.body_as_unicode()

    # An explicit "" default keeps the concatenation safe when the blob is
    # absent (is_empty's own default is defined elsewhere -- presumably
    # None); the resulting "}" fails to parse and selects the fallback.
    raw_data = is_empty(
        re.findall("page_products\'\:\s+([^\}]*)", body), "") + "}"
    try:
        data = json.loads(raw_data.strip().replace("'", "\""))
    except ValueError:
        data = {}

    product["description"] = is_empty(
        response.xpath(
            "//div[contains(@class, 'prd-description')]").extract())

    average = is_empty(
        response.xpath(
            "//span[contains(@class, 'b-rating-average')]/text()"
        ).extract())
    total = is_empty(
        response.xpath("//h2[@class='b-ttl-2']/span/text()").re(
            FLOATING_POINT_RGEX))
    if average and total:
        product["buyer_reviews"] = BuyerReviews(num_of_reviews=total,
                                                average_rating=average,
                                                rating_by_star={})

    if data:
        product["price"] = Price(price=data["prod_price"],
                                 priceCurrency=data["currency"])
        product["is_out_of_stock"] = not bool(int(data["stock_available"]))
        product["title"] = data["prod_name"]
        product["image_url"] = data["prod_image_url"]
        product["url"] = data["prod_url"]
        if not product["description"]:
            product["description"] = data["description"]
        product["brand"] = data["brand"]
    else:
        price = is_empty(
            response.xpath("//span[contains(@class, 'price')]/text()").re(
                FLOATING_POINT_RGEX), None)
        if price:
            product["price"] = Price(price=price, priceCurrency="GBP")
        product["title"] = is_empty(
            response.xpath(
                "//h1[contains(@class, 'b-ttl-main')]/text()").extract())
        product["image_url"] = is_empty(
            response.xpath(
                "//*[@id='cart-form']/div[2]/div[1]/div/div/a/@href").
            extract())
        product["url"] = response.url
        product["brand"] = is_empty(
            response.xpath("//span[@itemprop='brand']/text()").extract())

    if not product.get('brand', None):
        dump_url_to_file(response.url)

    cond_set_value(product, 'locale', "en-GB")

    if "You May Also Like" in body:
        catId = is_empty(re.findall("cat_id\'\:\s+(\d+)", body))
        sid = is_empty(re.findall("sid\'\:\s+\"([^\"]*)", body))
        if catId and sid and "item_id" in data:
            url = "http://www.rakuten.co.uk/api/recommendation?" \
                "category_id=%s" \
                "&item_id=%s" \
                "&shop_id=%s" % (catId, data["item_id"], sid)
            return Request(url=url,
                           callback=self._related_parse,
                           meta={"product": product})

    return product
def parse_product(self, response):
    """Populate the product and yield it, plus a recommendations request.

    This is a generator.  When the page embeds a ``window.ebuyer.config``
    JSON object, a request built from it (handled by
    ``self.get_recommended_id``) is yielded first; the product item is
    yielded last.

    Args:
        response: product page response; ``meta['product']`` must hold the
            item to update.
    """
    prod = response.meta['product']
    prod['url'] = response.url
    prod['locale'] = 'en_GB'

    titles = response.xpath('//h1[@class="product-title"]/text()').extract()
    if titles:
        prod['title'] = titles[0].strip()

    image_sources = response.xpath('//img[@itemprop="image"]/@src').extract()
    if image_sources:
        prod['image_url'] = urlparse.urljoin(response.url, image_sources[0])

    prices = response.xpath('//span[@itemprop="price"]/text()').re(
        FLOATING_POINT_RGEX)
    if prices:
        prod['price'] = Price(price=prices[0], priceCurrency='GBP')

    # Prefer the description block; fall back to the itemprop list.
    descriptions = response.xpath(
        '//div[@class="product-description"]').extract()
    if not descriptions:
        descriptions = response.xpath(
            '//ul[@itemprop="description"]').extract()
    if descriptions:
        prod['description'] = descriptions[0].strip()

    brands = response.xpath('//img[@itemprop="logo"]/@alt').extract()
    if brands:
        prod['brand'] = brands[0]
    if not prod.get('brand', None):
        dump_url_to_file(response.url)

    availability = response.xpath(
        '//p[@itemprop="availability"]/@content').extract()
    if availability:
        prod['is_out_of_stock'] = not (availability[0] == 'in_stock')

    skus = response.xpath('//strong[@itemprop="sku"]/text()').extract()
    if skus:
        prod['model'] = skus[0]

    config_matches = re.findall(r'window.ebuyer.config\s=\s(.*);',
                                response.body_as_unicode())
    if config_matches:
        config = json.loads(config_matches[0])
        api_key = config['richRelevance']['apiKey']
        item_id = config['product']['id']
        session_id = config['sessionId']
        query = {'a': api_key,
                 'p': item_id,
                 's': session_id,
                 'pt': '|item_page.recs_1|item_page.recs_2',
                 'l': 1}
        related_link = self.SCRIPT_URL + urllib.urlencode(query)
        request_meta = response.meta.copy()
        request_meta['item_id'] = item_id
        yield Request(related_link,
                      callback=self.get_recommended_id,
                      meta=request_meta)

    yield prod
def parse_product(self, response):
    """Populate the product, requesting reviews and recommendations.

    While ``response.meta`` has no truthy ``'reviewed'`` flag and the page
    has a reviews link, a request for the review page is returned right
    away with this response stored in ``meta['initial_response']``
    (presumably ``self.populate_reviews`` re-enters this method with the
    flag set -- confirm there).  Otherwise the product fields are filled
    from the page and, if a product id is found in the body, a
    recommendations request is returned.

    Args:
        response: product page response; ``meta['product']`` must hold the
            item to update.

    Returns:
        A ``Request`` (reviews or recommendations) or the product.
    """
    reviewed = response.meta.get('reviewed')
    prod = response.meta['product']

    # No review request has been made yet: send one if the page links to
    # reviews, otherwise record that there are none.
    if not reviewed:
        revs_a = response.xpath('//a[@class="read_reviews_action"]')
        if revs_a:
            avg = revs_a.xpath(
                './/span[@itemprop="ratingValue"]/text()').extract()
            total = revs_a.xpath(
                './/span[@itemprop="ratingCount"]/text()').extract()
            rev_url = response.url + '/reviewhtml/all'
            meta = response.meta.copy()
            meta['avg'] = avg
            meta['total'] = total
            meta['initial_response'] = response
            return Request(rev_url,
                           callback=self.populate_reviews,
                           meta=meta)
        cond_set_value(prod, 'buyer_reviews', ZERO_REVIEWS_VALUE)

    title = response.xpath(
        '//div[@class="product-summary"]/h1/text()').extract()
    cond_set(prod, 'title', title)

    # Keep the brand as a scalar.  It used to be wrapped in a list, which
    # is always truthy, so the title-based fallback could never run.
    brand = is_empty(
        re.findall(r'"manufacturer":\s"(.*)",', response.body), None)
    if not brand and prod.get("title"):
        brand = guess_brand_from_first_words(prod['title'])
    if brand:
        cond_set_value(prod, 'brand', brand)
    if not prod.get('brand', None):
        dump_url_to_file(response.url)

    price = response.xpath(
        '//p[@class="new-price"]/meta[@itemprop="price"]/@content'
    ).extract()
    priceCurrency = response.xpath(
        '//p[@class="new-price"]/meta[@itemprop="priceCurrency"]/@content'
    ).extract()
    if (price and priceCurrency
            and re.match("\d+(.\d+){0,1}", price[0])):
        prod["price"] = Price(priceCurrency=priceCurrency[0],
                              price=price[0])
    else:
        # Missing or non-numeric price: fall back to a zero GBP price.
        prod["price"] = Price(priceCurrency="GBP", price=0.00)

    des = response.xpath('//div[@class="productDescription"]').extract()
    cond_set(prod, 'description', des)

    img_url = response.xpath(
        '//div[@class="product-images"]/img/@src').extract()
    cond_set(prod, 'image_url', img_url)

    cond_set(prod, 'locale', ['en-US'])

    if not prod.get("reseller_id"):
        reseller_id = response.xpath(
            './/*[@itemprop="sku"]/text()').extract()
        cond_set(prod, 'reseller_id', reseller_id)

    prod['url'] = response.url

    available = response.xpath(
        '//form[contains(@id,"addToCartForm")]/input[@type="submit"]/@value'
    ).extract()
    if available and 'Email when back in stock' in available[0]:
        cond_set(prod, 'is_out_of_stock', [True])
    if available and 'Last few in store' in available[0]:
        lim = LimitedStock(is_limited=True, items_left=[1])
        cond_set(prod, 'limited_stock', [lim])

    prod_id = re.findall(r'"id":\s"(.*)",', response.body)
    if prod_id:
        recomm_url = self.RECOMM_URL.format(prod_id=prod_id[0])
        return Request(recomm_url,
                       callback=self.populate_recommendations,
                       meta=response.meta.copy())

    return prod
def parse_product(self, response):
    """Populate the product in two passes around the buyer-review request.

    First pass (no truthy ``'after_reviews'`` in ``response.meta``): fill
    the product fields from the page and, if a product id can be found,
    return a request for buyer reviews that carries ``jsessionid`` and
    ``product_id`` in its meta.

    Second pass (``'after_reviews'`` set): if both ids are present in the
    meta, return a request for related products.

    Args:
        response: product page response; ``meta['product']`` must hold the
            item to update.

    Returns:
        A follow-up ``Request`` or, when none can be built, the product.
    """
    product = response.meta['product']

    if not response.meta.get('after_reviews'):
        # First response for this product: parse it as usual.
        cond_set(product, 'title', response.xpath(
            "//div[@class='description']/h1[@itemprop='name']/text()"
        ).extract())

        cond_set(product, 'brand', response.xpath(
            "//div[@class='product-media-top']/"
            "img[@id='product_brand_img']/@alt"
        ).extract())
        if not product.get('brand', None):
            dump_url_to_file(response.url)

        cond_set(product, 'image_url', response.xpath(
            "//div[@class='product-media-top']/noscript"
            "/a[@id='product_image_ref']/img/@src").extract())

        price = response.xpath(
            "//p[@id='product_price']/span[@itemprop='price']"
            "/text()").re(FLOATING_POINT_RGEX)
        if price:
            product['price'] = Price(price=price[0], priceCurrency='GBP')

        cond_set(product, 'description', response.xpath(
            "//div[@id='product_details_container']"
            "/div[@class='description']"
        ).extract())

        # The reseller id is the last path segment of the product's url.
        regex = "\/([a-z\d]+)(?:$|\?)"
        reseller_id = re.findall(regex, product.get('url', ''))
        reseller_id = reseller_id[0] if reseller_id else None
        cond_set_value(product, "reseller_id", reseller_id)

        stock_status = response.xpath(
            '//link[@itemprop="availability"]/@href'
        ).extract()
        if stock_status:
            # NOTE(review): an "OutOfStock" availability sets
            # is_in_store_only, not is_out_of_stock -- confirm this
            # mapping is intended.
            product['is_in_store_only'] = 'OutOfStock' in stock_status[0]

        cond_set_value(product, 'locale', "en-GB")

        # Data needed for the additional recommendations request.
        jsessionid = response.xpath(
            '//input[@id="jsessionid_value_V1_MR_rr"]/@value'
        ).extract()
        product_id = response.xpath(
            '//input[@id="product_value_v1_th_rr"]/@value'
        )
        product_id = product_id or response.css(
            '[itemprop=productID]::text'
        )
        product_id = product_id.extract()
        product_id = product_id[0] if product_id else None

        # Id used for reviews and model (may differ from the id used for
        # recommendations).
        prod_id = re.findall(r"'ecomm_prodid':\s'(.*)'", response.body)
        if prod_id:
            prod_id = prod_id[0].strip()
            product['model'] = prod_id

        if prod_id or product_id:
            # Populate buyer reviews via a follow-up request.
            rev_url = self.REVS_BASE.format(prod_id=prod_id or product_id)
            meta = response.meta.copy()
            meta['jsessionid'] = jsessionid
            meta['product_id'] = product_id
            return Request(rev_url,
                           callback=self.populate_buyer_reviews,
                           meta=meta)
        self.log('Could not scrape buyer reviews '
                 '(product id could not be scraped)')
    else:
        # Second call, made after buyer reviews were populated.
        jsessionid = response.meta.get('jsessionid')
        product_id = response.meta.get('product_id')
        if jsessionid and product_id:
            scheme = 'V1_MR_rr'
            url = self.generate_related_url(jsessionid, product_id, scheme)
            return Request(url,
                           callback=self.populate_related,
                           meta={'product': product,
                                 'jsessionid': jsessionid,
                                 'product_id': product_id},
                           dont_filter=True)

    return product