def parse_product(self, response):
    """Yield one product item per entry of the page's ``stockMatrix`` script data.

    Fields shared by every option (url, name, sku, category, image, brand)
    are loaded once from the page. Each ``stockMatrix`` entry then supplies,
    in positional order, its attribute values, identifier, stock text and
    price.
    """
    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('url', '//link[@rel="canonical"]/@href')
    loader.add_xpath('name', '//span[@id="productName"]//text()')
    loader.add_xpath('sku', '//span[@id="productEAN"]/text()[last()]')
    loader.add_xpath('category', '//div[@id="breadcrumb"]/ul/li[position()>1]/a/span/text()')
    loader.add_css('image_url', '.productImageItem ::attr(href)')
    brand = response.css('.brand ::text').extract_first()
    # The literal text "null" is treated as "no brand".
    if brand != "null":
        loader.add_value('brand', brand)
    item = loader.load_item()
    # Non-greedy capture of whatever is assigned to stockMatrix, up to the first ';'.
    p = re.compile('stockMatrix = (.+?);', re.DOTALL)
    data = response.xpath('//script/text()').re(p)
    # Raises IndexError when no script contains a stockMatrix assignment.
    options = json.loads(data[0])
    for option in options:
        # Start every option from a copy of the shared fields.
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value(None, item)
        # The entry is consumed positionally: one value per .skuAttribute
        # element, then identifier, stock text and price.
        opt_iter = iter(option)
        opt_name = ''
        for attribute in response.css('.skuAttribute'):
            opt_name = opt_iter.next()
            loader.add_value('name', opt_name)
        # NOTE(review): the original indentation was lost; the colour image
        # lookup is placed after the attribute loop (it uses the last
        # attribute value) -- confirm it was not meant to run per attribute.
        colour_url = response.xpath('//input[@class="colourImageUrl"][@name="%s"]/@value' %opt_name).extract_first()
        if colour_url:
            loader.replace_value('image_url', 'http://media.littlewoods.com/i/littlewoods/%s?$1064x1416_standard$' %colour_url)
        loader.replace_value('identifier', opt_iter.next())
        stock = opt_iter.next()
        # Options whose stock text starts with "Unavailable" are not emitted.
        if stock.startswith('Unavailable'):
            continue
        # 1 unless the stock text contains "Out of stock".
        loader.replace_value('stock', int('Out of stock' not in stock))
        loader.replace_value('price', opt_iter.next())
        yield loader.load_item()
def parse_product(self, response):
    """Yield a product item for pages whose condition label contains "Used".

    Pages with any other condition, or with no condition label at all, are
    skipped. The raw condition text is attached to the item as metadata.
    """
    condition = response.css('.condition span::text').extract_first()
    # A missing label used to crash on ``None.title()``; treat it as "not used".
    if not condition or 'Used' not in condition.title():
        return

    loader = ProductLoader(response=response, item=Product())
    # The identifier is the last path segment of the page URL.
    identifier = response.url.split('/')[-1]
    loader.add_value('identifier', identifier)
    loader.add_xpath('sku', '//script/text()', re='skuCode": *"(.+)?"')
    # Drop the first and last breadcrumb entries.
    categories = response.css('.f-breadcrumb a::text').extract()[1:-1]
    loader.add_xpath('brand', '//script/text()', re='manufacturerName": *"(.+)?"')
    loader.add_value('category', categories)
    loader.add_xpath('name', '//script/text()', re='fullProductName": *"(.+)?"')
    loader.add_xpath('price', '//script/text()', re=r'currentPrice": *([.\d]+)?')
    loader.add_value('url', response.url)
    loader.add_css('image_url', '.f-slideshow img::attr(src)')

    metadata = WexMeta()
    metadata['condition'] = condition
    product = loader.load_item()
    product['metadata'] = metadata
    yield product
def parse_product(self, response):
    """Schedule every variant page, then yield the item for the current page.

    The variant data is read from an escaped JSON blob embedded in a script
    tag. Selected variant values are appended to the name and recorded in
    the item's metadata.
    """
    raw_blob = response.xpath('//script/text()').re('{\\\\"Variants.+}')[0]
    data = json.loads(raw_blob.replace('\\"', '"'))

    # Every listed variant is requested through this same callback.
    for variant in data['Variants']:
        variant_url = response.urljoin(variant['ProductPLU'])
        yield Request(make_variant_url(variant_url), self.parse_product)

    loader = ProductLoader(item=Product(), response=response)
    plu = response.xpath('//input[@id="ProductPLU"]/@value').extract_first()
    loader.add_value('identifier', plu)
    loader.add_value('sku', plu)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '(//h1[@itemprop="name"]/text())[1]')

    metadata = {}
    for number in xrange(1, 4):
        selected = data['Variant%dSelected' % number]
        if not selected or selected == 'N/A':
            continue
        loader.add_value('name', selected)
        metadata[data['Variant%dHeader' % number]] = selected
        if 'size' in selected.lower():
            # Drop the first five characters of the selected value.
            metadata['size'] = selected[5:].strip()

    prices = response.css('.price-value .currency::text').extract()
    # The last matched price text is the one used.
    loader.add_value('price', prices.pop())
    breadcrumbs = response.css('.breadcrumb a::text').extract()
    loader.add_value('category', breadcrumbs[1:])
    loader.add_css('image_url', '.product-image::attr(src)')
    loader.add_xpath('brand', '//meta[@itemprop="brand"]/@content')
    loader.add_value('shipping_cost', '7.95')

    available = response.css('.product-stock-widget::attr(ng-init)').re('AvailableOnline: (\w+)')[0]
    if available != 'true':
        loader.add_value('stock', 0)

    item = loader.load_item()
    item['metadata'] = metadata
    yield item
def parse_product(self, response):
    """Yield a single product item scraped from the product page."""
    loader = ProductLoader(Product(), response=response)

    product_ids = response.xpath('//script/text()').re('ecomm_prodid: *(\d+),')
    loader.add_value('identifier', product_ids)
    loader.add_value('url', response.url)

    # Join all heading text nodes and collapse runs of whitespace.
    heading = ''.join(response.xpath('//h1//text()').extract())
    loader.add_value('name', ' '.join(heading.split()))

    loader.add_css('price', 'span.GBP::attr(content)')
    loader.add_xpath('sku', '//span[@id="js-product-reference"]/@data-ref')

    crumbs = response.xpath('//div[contains(@class, "breadcrumb")]//a/span/text()').extract()
    loader.add_value('category', crumbs[1:])

    image_links = response.xpath('//a[@class="product__image__zoom-link"]/@href').extract()
    if image_links:
        image_url = response.urljoin(image_links[0])
    else:
        image_url = ''
    loader.add_value('image_url', image_url)

    brand_texts = response.xpath('//span[@class="product-content__title--brand"]/text()').extract()
    if brand_texts:
        brand = brand_texts[0].strip()
    else:
        brand = ''
    loader.add_value('brand', brand)

    # Without the "in Stock" marker the product is recorded as out of stock.
    in_stock = response.xpath('//span[@id="js-product-in-stock-default" and contains(text(), "in Stock")]')
    if not in_stock:
        loader.add_value('stock', 0)

    yield loader.load_item()
def parse_product(self, response):
    """Yield a single product item scraped from the product page."""
    loader = ProductLoader(Product(), response=response)

    product_code = response.css('input.baseProductCode::attr(value)').extract_first()
    loader.add_value('identifier', product_code)
    loader.add_value('sku', product_code)
    loader.add_value('url', response.url)

    title = response.css('h1.pdp-headline span.pdp-description::text').extract_first()
    loader.add_value('name', title)
    loader.add_css('price', 'p.pdp-price::text')

    # Drop the last breadcrumb, strip the rest, and remove the first
    # "Designer" entry if there is one.
    crumbs = response.css('div#breadcrumb a::text').extract()[:-1]
    crumbs = [crumb.strip() for crumb in crumbs]
    try:
        crumbs.remove('Designer')
    except ValueError:
        pass
    loader.add_value('category', crumbs)

    main_image = response.xpath('//@data-main-img-url').extract_first()
    if main_image:
        loader.add_value('image_url', response.urljoin(main_image))

    loader.add_value('brand', response.css('h1.pdp-headline a::text').extract_first())

    # Stock level codes found after this product's code in the JSON attribute.
    level_codes = response.xpath('//@data-stl-json').re('%s.+?stockLevelCode":"(.+?)"' % product_code)
    if level_codes and 'inStock' not in level_codes:
        loader.add_value('stock', 0)

    yield loader.load_item()
def parse_product(self, response):
    """Yield one item per row of the page's product table.

    Shared fields are loaded once from the page. Each table row overrides
    identifier, sku and price, and appends its own name when that name is
    not already part of the page-level name. Rows without a product code
    are skipped.
    """
    loader = ProductLoader(Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    sku = response.xpath('//div[@itemprop="description"]/div/div[last()]/text()').extract_first()
    loader.add_value('identifier', sku)
    loader.add_value('sku', sku)
    category = response.css('.breadcrumbs a::text').extract()[1:]
    category += response.css('.breadcrumbs li:last-of-type::text').extract()
    loader.add_value('category', category)
    image_url = response.css('img.gallery-main-image::attr(src)').extract_first()
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url))
    if not response.css('.in-stock'):
        loader.add_value('stock', 0)
    item = loader.load_item()

    for option in response.css('table.product-table tbody tr'):
        codes = option.css('span.product-code::text').re(r'\((.+)\)')
        if not codes:
            # A row without a "(code)" used to abort the whole page with IndexError.
            continue
        sku = codes[0]
        # A missing option name is treated as empty instead of raising TypeError.
        name = option.css('span.product-name::text').extract_first() or ''

        loader = ProductLoader(Product(), selector=option)
        loader.add_value(None, item)
        # Hash the encoded bytes: md5 of raw text fails on non-ASCII names,
        # while ASCII names produce the same digest as before.
        digest = hashlib.md5((item['name'] + name).encode('utf-8')).hexdigest()
        loader.replace_value('identifier', '-'.join((sku, digest)))
        loader.replace_value('sku', sku)
        # The RRP is added first; the value from the price cell then replaces it.
        loader.add_css('price', 'span.product-price-rrp')
        price = option.css('td.product-price').xpath('text()[last()]').extract_first()
        loader.replace_value('price', price)
        if name not in item['name']:
            loader.add_value('name', name)
        yield loader.load_item()
def parse_product(self, response):
    """Yield the product if its part number and brand match the search.

    ``response.meta`` must carry ``sku`` (the searched part number),
    ``brand`` and ``brands`` (the accepted brand names). Pages whose part
    number differs from the searched one, or whose brand is missing or not
    in ``brands``, yield nothing.
    """
    brand = response.meta['brand']
    brands = response.meta['brands']
    sku_searched = response.meta['sku']

    sku = response.css('.part-number strong::text').extract_first()
    if not sku or sku.strip().upper() != sku_searched:
        return

    product_brand = response.xpath(
        '//tr[th[contains(text(), "Brand")]]/td[contains(@class, "data")]/text()'
    ).extract_first()
    # A page without a Brand row used to raise IndexError; it cannot match
    # any accepted brand, so it is skipped like any other mismatch.
    if product_brand is None or product_brand.upper().strip() not in brands:
        return

    loader = ProductLoader(Product(), response=response)
    loader.add_value('identifier', sku)
    loader.add_value('url', response.url)
    loader.add_css('name', '.product-name .h1::text')
    loader.add_xpath(
        'price', '//span[contains(@id, "price-excluding-tax")]/text()')
    loader.add_value('sku', sku)
    category = response.css('.breadcrumbs a::text').extract()[1:]
    loader.add_value('category', category)
    loader.add_css('image_url', 'img#image-main::attr(src)')
    loader.add_value('brand', brand)
    if response.css('.availability .out-of-stock'):
        loader.add_value('stock', 0)

    item = loader.load_item()
    # Items priced under 50 get a shipping cost of 5.
    if item['price'] < 50:
        item['shipping_cost'] = 5
    yield item
def parse_product(self, response):
    """Yield the item directly when out of stock, else request its shipping cost.

    In-stock products are passed on to ``parse_shipping`` through a
    submission of the page's ``orderform`` form, carried in the request meta.
    """
    loader = ProductLoader(Product(), response=response)
    loader.add_xpath('identifier', '//input[@name="productid"]/@value')
    loader.add_value('url', response.url)
    loader.add_css('name', '.descr::text')
    loader.add_css('price', 'span.currency::text')
    loader.add_value('sku', response.meta['sku'])

    thumbnail = response.css('img#product_thumbnail::attr(src)').extract_first()
    if thumbnail:
        loader.add_value('image_url', response.urljoin(thumbnail))
    loader.add_value('brand', response.meta['brand'])

    # Available quantity is read from an inline script.
    quantity = response.css('.quantity script::text').re('product_avail = (\d+);')[0]
    loader.add_value('stock', quantity)
    item = loader.load_item()

    if stock_is_zero(quantity) if False else quantity == '0':
        yield item
        return

    # The item identifier is used as the cookiejar key for this request.
    request_meta = {
        'cookiejar': item['identifier'],
        'item': Product(item)
    }
    yield FormRequest.from_response(response,
                                    formname='orderform',
                                    meta=request_meta,
                                    cookies=self.cookies,
                                    callback=self.parse_shipping,
                                    dont_filter=True)
def parse_product(self, response):
    """Yield one item per option row of the summary table.

    Name, sku and (when present) image come from the page's structured
    product data; identifier, price and stock come from each table row.
    """
    data = SpiderSchema(response).get_product()
    rows = response.xpath('//div[@class="summary-container"]/table//tr[not(th)]')
    for row in rows:
        loader = ProductLoader(item=Product(), response=response)

        option_label = row.xpath('.//td[contains(@class,"optionscol")]/text()')[0].extract()
        full_name = u'{} - {}'.format(data['name'], option_label)
        # The first class name of the row is used as the identifier.
        row_class = row.xpath('@class')[0].extract()
        option_id = row_class.split(' ')[0]
        option_price = row.xpath('@data-price').extract()

        loader.add_value('name', full_name)
        loader.add_value('url', response.url)
        loader.add_value('sku', data['sku'])
        loader.add_value('identifier', option_id)

        if 'image' not in data:
            loader.add_xpath('image_url', '//meta[@itemprop="og:image"]/@content')
        else:
            loader.add_value('image_url', data['image'])

        # A row whose class lacks "instock" is recorded as out of stock.
        if not row.xpath('@class').re('instock'):
            loader.add_value('stock', 0)

        loader.add_value('price', option_price)
        loader.add_css('category', 'div.product_meta span.posted_in a::text')
        yield loader.load_item()
def parse_product(self, response):
    """Yield one item per option row, or a single zero-priced item if none.

    The page-level identifier is the first run of four digits in the URL.
    """
    loader = ProductLoader(Product(), response=response)
    page_id = re.search('\d\d\d\d', response.url).group(0)
    loader.add_value('identifier', page_id)
    loader.add_value('sku', page_id)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//header[@class="prodCat"]/h1/text()')

    # Breadcrumb links after the first, plus the text of the last entry.
    crumbs = response.css('.bread li a::text').extract()[1:]
    crumbs += response.css('.bread li:last-child::text').extract()
    loader.add_value('category', crumbs)

    image_href = response.css('.detimg a::attr(href)').extract_first()
    if image_href:
        loader.add_value('image_url', response.urljoin(image_href))
    item = loader.load_item()

    rows = response.css('.tbl').xpath('.//*[@class="tr"]')
    if not rows:
        item['price'] = 0
        yield item
        return

    for row in rows:
        row_loader = ProductLoader(Product(), selector=row)
        row_loader.add_value(None, item)
        row_id = row.xpath('.//input/@name').extract_first()
        row_loader.replace_value('identifier', row_id)
        row_loader.replace_value('sku', row_id)
        # The "now" price replaces any price; the plain cell text is then added.
        row_loader.replace_css('price', '.tc-price .pr-now::text')
        row_loader.add_css('price', '.tc-price::text')
        row_loader.replace_css('name', '.tc-title::text')
        yield row_loader.load_item()
def parse_product(self, response):
    """Yield the displayed product once and request every option combination.

    The item is yielded only if its identifier has not been seen before
    (tracked in ``self.identifiers``). One AJAX request per combination of
    select-box values is then sent to ``parse_option``. Pages missing either
    identifier input are logged and skipped.
    """
    item_identifier = response.xpath(
        '//input[@id="item_details_item_id"]/@value').extract_first()
    product_identifier = response.xpath(
        '//input[@id="item_details_product_id"]/@value').extract_first()
    if not item_identifier or not product_identifier:
        # Previously this only logged and then crashed concatenating None.
        self.logger.warning('No identifier on %s' % response.url)
        return
    identifier = item_identifier + '-' + product_identifier

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('category', response.meta['category'])
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    option_name = response.css('.label-select-container').xpath(
        './/option[@selected]/text()').extract()
    loader.add_value('name', option_name)
    loader.add_value('identifier', identifier)
    loader.add_value('url', response.url)
    loader.add_xpath('price', '//meta[@itemprop="price"]/@content')

    # The sku is the order-code text followed by any nested span texts.
    sku = []
    sku.append(
        response.css('.order-code').xpath(
            'text()').extract_first().strip())
    sku.extend(response.css('.order-code span::text').extract())
    loader.add_value('sku', ' '.join(sku))
    loader.add_xpath('image_url', '//img[@id="imageMain"]/@src')
    loader.add_css('brand', '.sku_kc_brand_id_ ::text')
    if loader.get_output_value('price') < 50:
        loader.add_value('shipping_cost', '2.99')

    # A missing availability tag is treated as an unknown status, not a crash.
    stock = response.xpath(
        '//meta[@itemprop="availability"]/@content').extract_first() or ''
    stock = stock.replace(' ', '').lower()
    if stock not in self.instock:
        loader.add_value('stock', 0)
        if stock not in self.outofstock:
            self.logger.warning('Undefined stock status for %s' %
                                response.url)

    item = loader.load_item()
    if item['identifier'] not in self.identifiers:
        self.identifiers.add(item['identifier'])
        yield item

    # Collect each select's attribute name and its list of option values.
    attributes = []
    options = []
    for attribute in response.css('.label-select-container select'):
        attribute_name = attribute.xpath('@id').extract_first()
        attribute_name = attribute_name.replace('_%s' % item_identifier, '')
        attributes.append(attribute_name)
        options.append(attribute.xpath('option/@value').extract())

    for variant in itertools.product(*options):
        url = 'http://www.kiddicare.com/ajax.get_exact_product.php?instart_disable_injection=true&item_id=%s' % item_identifier
        for n, option in enumerate(variant):
            url += '&attributes[%s]=%s' % (attributes[n], option)
        url = url.replace('+', '%2B')
        # Copy the meta so requests do not share (and mutate) response.meta.
        meta = dict(response.meta)
        meta['sku'] = sku
        meta['attributes'] = attributes
        yield Request(url, self.parse_option, meta=meta)
def parse_product(self, response):
    """Yield the main product, then one item per entry of the page's variants.

    Variants are read from the first line of the page body that contains
    ``variants:``. Pages without structured product data, or whose product
    data has no name, yield nothing.
    """
    try:
        pdata = SpiderSchema(response).get_product()
    except Exception:
        # Was a bare ``except:``, which also swallowed interpreter-exit signals.
        self.logger.error('No structured product data on %s' %response.url)
        return

    options = None
    js_line = ''
    for line in response.body.split('\n'):
        if 'variants:' in line:
            js_line = line
            break
    if js_line:
        # The capture group is optional and is None when the line does not
        # end in "};" -- that case used to raise TypeError.
        raw_variants = re.search(r'variants:(.*};)?', js_line).group(1)
        if raw_variants:
            # Drop the last two characters before decoding.
            options = demjson.decode(raw_variants[:-2].strip())

    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_css('sku', 'span.pd_productVariant::text')
    product_loader.add_xpath('identifier', '//input[@name="productId"]/@value')
    product_loader.add_value('url', response.url)
    try:
        product_loader.add_value('name', pdata['name'])
    except KeyError:
        return
    category = response.xpath('//*[@id="breadcrumb"]//a/text()').extract()[1:-1]
    product_loader.add_value('category', category)
    img = response.xpath('//meta[@property="og:image"]/@content').extract()
    if img:
        product_loader.add_value('image_url', response.urljoin(img.pop()))
    price = response.xpath('//p[@class="productOfferPrice"]/text()').extract()[0]
    product_loader.add_value('price', price)
    if product_loader.get_output_value('price') < 45:
        product_loader.add_value('shipping_cost', '3.5')

    brand = response.xpath('//*[@id="brandHeader"]/a/@href').extract()
    if brand:
        # Strip the "/en/" prefix and the last character from the brand link;
        # anything that still contains a slash is not used as a brand.
        brand = brand[0].replace('/en/', '')[:-1]
        if '/' not in brand:
            product_loader.add_value('brand', brand)

    stock = response.xpath('//link[@itemprop="availability"]/@href').extract_first()
    if stock != 'http://schema.org/InStock':
        product_loader.add_value('stock', 0)

    product = product_loader.load_item()
    yield product

    if options:
        for key, val in options.items():
            option_product = Product(product)
            option_product['name'] = product['name'] + ' ' + key.replace('_', ' ')
            option_product['sku'] = val['productCode']
            option_product['identifier'] = val['variantId']
            option_product['price'] = extract_price(val['nowPrice'])
            yield option_product
def parse_product(self, response):
    """Yield a product item, retrying the page up to 10 times if it has no title.

    Pages without a price are not yielded; a message is appended to
    ``self.errors`` instead.
    """
    base_url = get_base_url(response)
    try:
        name = response.css(
            '.content-fiche-produit h1::text').extract_first().strip()
    except Exception:
        # Was a bare ``except:``. A page without a title is re-requested,
        # with the attempt count carried in the request meta.
        retry = int(response.meta.get('retry', 0))
        if retry < 10:
            new_meta = response.meta.copy()
            new_meta['retry'] = retry + 1
            yield Request(response.url,
                          meta=new_meta,
                          callback=self.parse_product,
                          dont_filter=True)
        return

    # The category is the second-to-last breadcrumb link. A single-entry
    # breadcrumb used to raise IndexError and now gives an empty category.
    breadcrumb = response.css('#breadcrumb a::text').extract()
    category = breadcrumb[-2] if len(breadcrumb) >= 2 else ""

    sku = response.css('.content-fiche-produit p::text').re(
        u'Référence (\d+)')
    pid = response.css('.content-fiche-produit p::text').re(u'Ref (\d+)')
    price = response.css('.new-price ::text').extract_first()

    # NOTE(review): stock is computed but not loaded into the item (the
    # add_value call below is commented out) -- confirm this is intended.
    stock = bool(
        response.xpath(
            '//p[contains(@class, "in-stock")]/text()').extract())
    if not stock:
        stock = 'DISPONIBLE' in ''.join(
            response.xpath('//p[contains(@class, "availability")]//text()'
                           ).extract()).upper()

    if price:
        loader = ProductLoader(response=response, item=Product())
        loader.add_value('url', urljoin(base_url, response.url))
        loader.add_value('name', name)
        loader.add_css('image_url', '#image ::attr(src)')
        loader.add_value('price', extract_price2uk(price))
        loader.add_value('category', category)
        loader.add_value('sku', sku)
        loader.add_value('identifier', pid)
        loader.add_value('brand', response.meta.get("brand", ""))
        #loader.add_value('stock', int(stock))
        yield loader.load_item()
    else:
        self.errors.append("No price set for url: '%s'" %
                           urljoin(base_url, response.url))
def parse_product(self, response):
    """Yield the product, or request its price via the cart when none is shown.

    When the page shows no price, an add-to-cart URL is requested and the
    partially loaded product is handed to ``parse_price_from_cart``.
    """
    loader = ProductLoader(item=Product(), response=response)

    loader.add_css('name', '.pro-des::text')

    # The price is rebuilt by joining the digit groups with a dot.
    digit_groups = response.xpath('//div[@class="price-strike"]/div/span//text()').re('\d+')
    price = '.'.join(digit_groups)
    loader.add_value('price', price)

    loader.add_xpath('sku', '//div[@class="short-desc"]/span//text()')

    # The identifier combines the product id and the price id.
    productid = response.xpath('//input[@id="selectedProductIdd"]/@value').extract()[0]
    priceid = response.xpath('//input[@id="priceId"]/@value').extract()[0]
    loader.add_value('identifier', '-'.join((productid, priceid)))

    loader.add_xpath(
        'category',
        "//div[@class='bread']//li[position() > 1]//text()[not(contains(., '>'))]"
    )
    loader.add_xpath('image_url', "//meta[@property='og:image']/@content")
    loader.add_value('url', response.url)
    loader.add_xpath('brand', '//div[@class="added-item"]/h2/text()')

    # Shipping is 9.9 for prices under 200, otherwise 0.
    if loader.get_output_value('price') < 200:
        shipping_cost = 9.9
    else:
        shipping_cost = 0
    loader.add_value('shipping_cost', shipping_cost)

    product = loader.load_item()
    if price:
        yield product
        return

    storeid = response.xpath('//input[@id="storeId"]/@value').extract()[0]
    cart_url = 'http://www.courts.com.sg/home/addtocart.html?isAdd=true&newProduct=true&productId=%s&selectedCurrency=SGD&quantity=1&cartId=na&addQuantity=true&newQuantity=1&shippingOption=&shippingCity=&deliveryOption=&shippingDate=&cityId=&title=&inventorysensible=yes&priceId=%s&storeId=%s'
    request_meta = {
        'product': Product(product),
        'dont_merge_cookies': True
    }
    yield Request(cart_url % (productid, priceid, storeid),
                  callback=self.parse_price_from_cart,
                  meta=request_meta)
def parse_product(self, response):
    """Request every variant through the variant form and yield the current sku.

    One form submission per variant element is sent back to this callback.
    If the sku on the page differs from the ``skuId`` URL parameter, the
    page is re-requested under the sku's own URL. An item is yielded only
    when a price was found.
    """
    if response.url.endswith('page-not-found.page'):
        return

    formdata = {}
    for inp in response.xpath('//form[@id="variant-form"]//input'):
        formdata[inp.xpath('@name').extract_first()] = inp.xpath(
            '@value').extract_first()
    if not formdata:
        self.logger.warning('No data on %s' % response.url)
        return
    # Inputs without a name end up under the None key. ``del`` raised
    # KeyError when every input was named, so pop with a default instead.
    formdata.pop(None, None)

    options = response.css('.vContainer .variantDataElement')
    for option in options:
        formdata[option.xpath('@name').extract_first()] = option.xpath(
            '@data-variant-value').extract_first()
        r = FormRequest.from_response(
            response,
            formxpath='//form[@id="variant-form"]',
            formdata=formdata,
            callback=self.parse_product)
        yield r

    loader = ProductLoader(item=Product(), response=response)
    sku = response.xpath('//input[@id="skuIdVal"]/@value').extract_first()
    if sku != url_query_parameter(response.url, 'skuId'):
        url = add_or_replace_parameter(url_query_cleaner(response.url),
                                       'skuId', sku)
        yield Request(url, self.parse_product)
        return

    loader.add_value('identifier', sku)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1[@id="productLabel"]//text()')
    #loader.add_css('name', '.selected .variantDisplayName_title ::text')
    loader.add_css('price', '.current-price ::text')
    loader.add_value('sku', sku)
    category = response.xpath(
        '//div[@id="breadcrumb"]//li//span[@itemprop="title"]/text()'
    ).extract()
    # Up to three breadcrumb titles, excluding the last one.
    loader.add_value('category', category[-4:-1])
    image_url = response.xpath(
        '//img[@itemprop="image"]/@src').extract_first()
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url))
    loader.add_xpath(
        'brand', '//div[@itemprop="brand"]//span[@itemprop="name"]/text()')
    loader.add_value('shipping_cost', 3)
    #if not response.css('.stock-tag.in-stock') and not response.xpath('//link[@href="http://schema.org/InStock"]') and not response.css('.available-from'):
    if not response.css('.add-to-basket'):
        loader.add_value('stock', 0)
    if loader.get_output_value('price'):
        yield loader.load_item()
def parse_product(self, response):
    """Yield the product, or one item per combination of its select options.

    A response that landed on an error URL is retried from the first URL of
    its redirect chain. A missing stock label is treated as out of stock.
    """
    if 'aspxerrorpath' in response.url:
        yield Request(response.request.meta['redirect_urls'][0],
                      self.parse_product,
                      dont_filter=True)
        return

    loader = ProductLoader(Product(), response=response)
    identifier = response.xpath('//@data-feefo-vendor-ref').extract_first()
    loader.add_value('identifier', identifier)
    loader.add_value('url', response.url)
    loader.add_css('name', 'header.page-title h1::text')
    loader.add_css('price', 'header.product-sidebar__price h2::text')
    loader.add_value('sku', identifier)
    category = response.css('.breadcrumb a::text').extract()
    loader.add_value('category', category[1:-1])
    image_url = response.css(
        '.product-gallery__main-image img::attr(src)').extract_first()
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url))

    # A missing label used to crash on ``None.title()``.
    stock = response.css('.product-sidebar__stock::text').extract_first() or ''
    stock_title = stock.title()
    if 'Order Now' not in stock_title:
        loader.add_value('stock', 0)
    item = loader.load_item()
    if 'Discontinued' in stock_title:
        item['metadata'] = {"Discontinued?": "Yes"}

    option_types = response.css('.product-sidebar select')
    if not option_types:
        yield item
        return

    # For each select, keep the options whose value is not "Select".
    options = [
        option_type.xpath('option[@value!="Select"]')
        for option_type in option_types
    ]
    for variant in itertools.product(*options):
        loader = ProductLoader(Product(), response=response)
        loader.add_value(None, item)
        identifier = item['identifier']
        for option in variant:
            loader.add_value('name', option.xpath('text()').extract())
            identifier += '-' + option.xpath('@value').extract_first()
        loader.replace_value('identifier', identifier)
        loader.replace_value('sku', identifier)
        option_item = loader.load_item()
        # Each option item gets its own metadata dict rather than a shared one.
        option_item['metadata'] = dict(item.get('metadata', {}))
        yield option_item
def parse_product(self, response):
    """Load the product and request its first reviews page.

    The loaded product is not yielded here; it is passed in the request meta
    to ``parse_review_page``. Pages without an identifier are logged and
    skipped.
    """
    # The unused ``HtmlXPathSelector(response)`` local was removed.
    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name', '//input[@id="product-name"]/@value')
    loader.add_value('url', response.url)
    loader.add_css('brand', 'span.b-brand_title::text')
    # Skip the first two breadcrumb links.
    categories = response.css('div.b-breadcrumbs a::text').extract()[2:]
    loader.add_value('category', categories)
    loader.add_xpath('sku', '//meta[@itemprop="model"]/@content')

    identifier = response.xpath('//input[@name="pid"]/@value').extract()
    if not identifier:
        log.msg('PRODUCT WHIOUT IDENTIFIER: ' + response.url)
        return
    loader.add_value('identifier', identifier[0])

    # Prefer the image_src link; fall back to the itemprop image meta tag.
    image_url = response.xpath('//link[@rel="image_src"]/@href').extract(
    ) or response.xpath('//meta[@itemprop="image"]/@content').extract()
    if image_url:
        loader.add_value('image_url', image_url[0])

    price = response.xpath('//meta[@itemprop="price"]/@content').extract()
    loader.add_value('price', price)

    out_of_stock = response.css('div.b-availability').xpath(
        './/span[@data-availability="NOT_AVAILABLE"]')
    if out_of_stock:
        loader.add_value('stock', '0')
    product = loader.load_item()

    promo = response.xpath(
        '//div[@class="b-product_promo"]/div/span/text()').extract()
    metadata = ToyMonitorMeta()
    metadata['reviews'] = []
    if promo:
        metadata['promotions'] = promo[0].strip()
    product['metadata'] = metadata

    reviews_url = 'http://mark.reevoo.com/reevoomark/en-GB/product.html?page=1&sku=%s&tab=reviews&trkref=MOT'
    yield Request(reviews_url % identifier[0],
                  callback=self.parse_review_page,
                  meta={'product': product})
def parse_products(self, response):
    """Load every product on a listing page and request its reviews.

    Products are not yielded here; each is passed in the request meta to
    ``parse_reviews``. The category comes from the page breadcrumb and is
    shared by all products on the page.
    """
    category = response.xpath(
        '//div[@id="breadcrumb"]//span[@itemprop="name"]/text()').extract(
        )[2:]
    for product_sel in response.css('.productList .product'):
        loader = ProductLoader(item=Product(), selector=product_sel)
        loader.add_xpath('identifier', '@id', re='product-(.+)')
        loader.add_xpath('url', './/@href')
        brand = product_sel.xpath('.//h3/em/text()').extract_first()
        name = product_sel.xpath('.//h3/span/text()').extract_first()
        # A name starting in lower case gets the brand prepended. A missing
        # or empty name used to crash on ``name[0]``.
        if name and name[0].islower():
            loader.add_value('name', brand)
        loader.add_value('name', name)
        loader.add_css('price', '.productPrice dd:last-child::text')
        loader.add_xpath('sku', '@id', re='product-(.+)')
        loader.add_value('category', category)
        loader.add_css('image_url', '.productMainImage img::attr(src)')

        # The promotion is detected from the image URL.
        image_url = loader.get_output_value('image_url')
        promotion = None
        if image_url and '3for2' in image_url:
            promotion = '3 for 2'

        loader.add_value('brand', brand)
        loader.add_value('shipping_cost', '3.99')

        # A missing stock element is treated as out of stock instead of
        # crashing on ``None.title()``.
        stock = (product_sel.css('.productStock dd').extract_first() or '').title()
        if 'In Stock' not in stock and 'Low Stock' not in stock:
            loader.add_value('stock', 0)

        # Named ``item`` so the loop's selector variable is not rebound.
        item = loader.load_item()
        metadata = ToyMonitorMeta()
        metadata['reviews'] = []
        if promotion:
            metadata['promotions'] = promotion
        item['metadata'] = metadata

        # The review product id is the number before ".prd" in the URL.
        prod_id = re.findall(r"/(\d+).prd", item['url'])[0]
        reviews_url = "http://api.bazaarvoice.com/data/batch.json?passkey=35w0b6mavcfmefkhv3fccjwcc&apiversion=5.5&displaycode=17045-en_gb&resource.q0=reviews&filter.q0=isratingsonly%3Aeq%3Afalse&filter.q0=productid%3Aeq%3A" + prod_id + "&filter.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&sort.q0=isfeatured%3Adesc&stats.q0=reviews&filteredstats.q0=reviews&include.q0=authors%2Cproducts%2Ccomments&filter_reviews.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&filter_reviewcomments.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&filter_comments.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&limit.q0=100&offset.q0=0&limit_comments.q0=3&callback=bv_1111_57043"
        request = Request(reviews_url,
                          meta={
                              'product': item,
                              'offset': 0
                          },
                          callback=self.parse_reviews)
        yield request
def parse_product(self, response):
    """Yield one item per combination of the product's selectable options.

    Only table rows whose first cell text is in ``self.options_to_extract``
    contribute options. An option's "(Add ...)" suffix is parsed as a
    surcharge and added to the base price. If no combination exists, the
    base item is yielded unchanged.
    """
    loader = ProductLoader(Product(), response=response)
    loader.add_value('url', response.url)
    category = response.css('div.treemenu a::text').extract()[1:]
    loader.add_value('category', category)
    loader.add_css('image_url', 'div#mainimage_holder img::attr(data-zoom-image)')
    identifier = response.xpath('//input[@name="fproduct_id"]/@value').extract_first()
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    loader.add_css('price', 'li.shelfBnormalprice::text')
    if loader.get_output_value('price') < 100:
        loader.add_value('shipping_cost', 10)
    item = loader.load_item()

    rows = [
        row for row in response.css('table.variabletable tr')
        if row.xpath('td[1]/text()').extract_first() in self.options_to_extract
    ]
    options = [
        row.xpath('td/select/option[not(contains(.,"Please Select"))]')
        for row in rows
    ]

    # An itertools.product object is always truthy, so the old
    # ``if not variants`` check could never fire. Materialise it so a
    # select with no usable options falls back to the base item.
    variants = list(itertools.product(*options))
    if not variants:
        yield item
        return

    for variant in variants:
        loader = ProductLoader(Product(), response=response)
        loader.add_value(None, item)
        identifier = item['identifier']
        price = item['price']
        for option in variant:
            identifier += '-' + option.xpath('@value').extract_first()
            name_and_price = option.xpath('text()').extract_first().split('(Add')
            loader.add_value('name', name_and_price[0])
            if len(name_and_price) > 1:
                price += extract_price(name_and_price[1])
        loader.replace_value('identifier', identifier)
        loader.replace_value('sku', identifier)
        loader.replace_value('price', price)
        if price >= 100:
            loader.replace_value('shipping_cost', 0)
        yield loader.load_item()
def parse_product(self, response):
    """Yield the product, re-requesting the page while it has no identifier.

    A page without an identifier is retried (bypassing the duplicate
    filter) until ``retries`` in the request meta exceeds 9; after that a
    warning is logged. No item is yielded for a page without an identifier.
    """
    identifier = response.css('span#thisstkcode::text').extract_first()
    if not identifier:
        retries = response.meta.get('retries', 0)
        if retries > 9:
            self.logger.warning('No identifier found on %s' % response.url)
        else:
            self.logger.debug('Retry %s to get identifier' % response.url)
            # Copy the meta so the current response's meta is not mutated.
            meta = dict(response.meta)
            meta['retries'] = retries + 1
            # ``dont_filter`` must be a keyword argument: the old code passed
            # the string 'dont_filter=True' positionally, which raises
            # TypeError inside Request.replace().
            yield response.request.replace(dont_filter=True, meta=meta)
        # The old code fell through and loaded an item with no identifier.
        return

    loader = ProductLoader(Product(), response=response)
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1/text()')
    price = response.css('span.prodPrice').xpath(
        './/span[@itemprop="price"]/text()').extract_first()
    loader.add_value('price', price)
    category = response.css('.breadcrumbs span::text').extract()[1:]
    loader.add_value('category', category)
    loader.add_css('image_url', '.main-product-photo::attr(href)')
    loader.add_css('brand', 'span#thisbrand::text')
    loader.add_css('stock', 'input#data-stock-qty::attr(value)')
    yield loader.load_item()
def parse_product(self, response):
    """Yield the product, or request each option page when none is selected."""
    selector_box = response.css('.pg_select')
    if selector_box and not selector_box.xpath('option[@selected]'):
        # No option is pre-selected: follow every option link instead of
        # loading this page.
        for option_href in selector_box.xpath('.//@data-href').extract():
            target = response.urljoin(url_query_cleaner(option_href))
            yield Request(target, self.parse_product)
        return

    loader = ProductLoader(Product(), response=response)
    sku_value = response.xpath('//div[@id="content"]//input[@name="sku"]/@value').extract_first()
    loader.add_value('identifier', sku_value)
    loader.add_value('sku', sku_value)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//strong[@itemprop="name"]/text()')

    # Price candidates are added in this order.
    for price_css in ('div.show h5 ::text', '.nowPrice ::text', '.typicalPrice h5 ::text'):
        loader.add_css('price', price_css)

    # The category path is a slash-separated string inside a JSON-like input value.
    category_paths = response.xpath('//input[@name="productDetailsDTO"]/@value').re('"category":"(.+?)"')
    if category_paths:
        loader.add_value('category', category_paths[0].split('/'))

    gallery_href = response.css('ul#galleryImages a::attr(href)').extract_first()
    if gallery_href:
        loader.add_value('image_url', response.urljoin(gallery_href))

    loader.add_xpath('brand', '//span[@itemprop="brand"]//span[@itemprop="name"]/text()')

    if response.css('div#content p.oos'):
        loader.add_value('stock', 0)
    yield loader.load_item()
def parse_product(self, response):
    """Load the product and request its reviews page.

    The item is not yielded here; it is passed in the request meta to
    ``parse_review_page``.
    """
    loader = ProductLoader(Product(), response=response)

    product_id = response.xpath('//input[@name="productId"]/@value').extract_first()
    if not product_id:
        # Without the productId input the product is recorded as out of
        # stock and the id is searched for in the page text instead.
        loader.add_value('stock', 0)
        product_id = response.xpath('//text()').re('productId=(.+?)&')
    loader.add_value('identifier', product_id)

    loader.add_value('url', url_query_cleaner(response.url))
    loader.add_css('name', 'div.productTitleDescriptionContainer h1::text')
    loader.add_css('price', 'p.pricePerUnit::text')
    loader.add_css('sku', 'p.itemCode::text', re='Item code:(.+)')

    crumbs = response.xpath('//ul[@id="breadcrumbNavList"]//a/span/text()').extract()
    # Remove the first "Home" entry if there is one.
    try:
        crumbs.remove('Home')
    except ValueError:
        pass
    loader.add_value('category', crumbs)

    image_src = response.css('img#productImageID::attr(src)').extract_first()
    if image_src:
        loader.add_value('image_url', response.urljoin(image_src))

    item = loader.load_item()
    item['metadata'] = {'reviews': []}

    review_id = response.xpath('//text()').re_first("productId: '(.+?)'")
    reviews_url = 'http://sainsburysgrocery.ugc.bazaarvoice.com/8076-en_gb/%s/reviews.djs?format=embeddedhtml' % review_id
    yield Request(reviews_url,
                  callback=self.parse_review_page,
                  meta={'item': item})
def parse_product(self, response):
    """Yield the product; contact-lens pages are delegated to ``parse_lenses``."""
    if 'contact-lenses' in response.url:
        for item in self.parse_lenses(response):
            yield item
        return

    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('identifier', '//input[@name="SKU"]/@value')
    loader.add_xpath('url', '//link[@rel="canonical"]/@href')
    loader.add_xpath('name',
                     '//ul[@id="Brand"]/li[position()>1]//text()',
                     re='.+')
    loader.add_css('price', '.itemPrice ::text')
    loader.add_xpath('sku', '//span[@itemprop="sku"]/text()')
    category = response.css('.breadcrumb span::text').extract()
    loader.add_value('category', category[1:-1])

    image_url = response.css('.currentImage ::attr(src)').extract_first()
    # Only join when an image was found: joining None stored the page URL
    # itself as the image URL.
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url))

    loader.add_xpath('brand', '//ul[@id="Brand"]/li[2]/strong/text()')
    if response.xpath(
            '//div[@id="Order"]//link/@href[contains(., "OutOfStock")]'):
        loader.add_value('stock', 0)
    yield loader.load_item()
def parse_product(self, response):
    """Scrape one product page; pages with ``login.cfm`` in the URL are skipped.

    The identifier is the MD5 hex digest of the last path segment of the
    URL. Price, SKU and availability are read from script text. Stock is
    set to 0 only when an availability value is found and it is not
    ``'In stock'``.
    """
    if 'login.cfm' in response.url:
        return

    loader = ProductLoader(Product(), response=response)

    last_segment = response.url.rsplit('/', 1)[-1]
    loader.add_value('identifier', hashlib.md5(last_segment).hexdigest())
    loader.add_value('url', response.url)
    loader.add_css('name', 'h1.content-title::text')
    loader.add_xpath('price', '//script/text()', re='price": "(.+)"')
    loader.add_xpath('sku', '//script/text()', re='sku": "(.+)"')

    # The first and last breadcrumb links are not used as categories.
    breadcrumb_links = response.xpath(
        '//ul[@id="breadcrumbs"][1]//a/text()').extract()
    loader.add_value('category', breadcrumb_links[1:-1])

    image_src = response.css(
        'div.product-detail-feature-img img::attr(src)').extract_first()
    if image_src:
        loader.add_value('image_url', response.urljoin(image_src))

    loader.add_xpath('brand', '//meta[@property="og:brand"]/@content')

    availability = response.xpath('//script/text()').re(
        'availability": "(.+)"')
    if availability and availability[0] != 'In stock':
        loader.add_value('stock', 0)

    yield loader.load_item()
def parse_product(self, response):
    """Scrape a product page and expand its variations when present.

    A base item is loaded from the page. If the page has no
    ``data-product_variations`` attribute, that base item is yielded
    alone. Otherwise the attribute is parsed as JSON and one item per
    variation is yielded, each starting from the base item's values.
    """
    loader = ProductLoader(Product(), response=response)

    # Try the two form inputs in turn. When neither gives an id the
    # product is recorded with zero stock and the id is taken from the
    # schema.org container's id attribute.
    product_id = response.xpath(
        '//input[@name="product_id"]/@value').extract_first()
    if not product_id:
        product_id = response.xpath(
            '//input[@name="add-to-cart"]/@value').extract_first()
    if not product_id:
        loader.add_value('stock', 0)
        container_ids = response.xpath(
            '//div[@itemtype="http://schema.org/Product"]/@id')
        product_id = container_ids.re_first('product-(\d+)')
    loader.add_value('identifier', product_id)

    loader.add_css('sku', 'span.sku::text')
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    # Both price selectors feed the same field, ex-VAT selector first.
    loader.add_css('price', '.product-price-exvat span.amount::text')
    loader.add_css('price', '.product-price span.amount::text')

    loader.add_value('category', response.xpath(
        '//span[@class="posted_in"][contains(., "Categories:")]/a/text()'
    ).extract_first())
    loader.add_css('image_url',
                   'div.single-product-main-image a::attr(href)')
    loader.add_value('brand', response.xpath(
        '//span[@class="posted_in"][contains(., "Brands:")]/a/text()'
    ).extract_first())

    base_item = loader.load_item()

    raw_variations = response.xpath(
        '//@data-product_variations').extract_first()
    if not raw_variations:
        yield base_item
        return

    for variant in json.loads(raw_variations):
        variant_loader = ProductLoader(Product(), response=response)
        variant_loader.add_value(None, base_item)
        variant_loader.replace_value('identifier', variant['variation_id'])
        variant_loader.replace_value('sku', variant['sku'])
        variant_loader.replace_value('price', variant['display_price'])
        # Keep the base image unless the variation supplies its own.
        if variant['image_link']:
            variant_loader.replace_value('image_url', variant['image_link'])
        # Attribute values are added to the name on top of the base name.
        variant_loader.add_value('name', variant['attributes'].values())
        yield variant_loader.load_item()
def parse_simple_product(self, response):
    """Yield a single item loaded from a product page.

    The value of the ``product`` form input is used for both the
    identifier and the SKU. The first breadcrumb link is left out of
    the category.
    """
    product_input_xpath = '//input[@name="product"]/@value'

    loader = ProductLoader(Product(), response=response)
    loader.add_xpath('identifier', product_input_xpath)
    loader.add_value('url', response.url)
    loader.add_css('name', 'div.product-name h1::text')
    loader.add_css('price', 'li.bigPrice span.price::text')
    loader.add_xpath('sku', product_input_xpath)

    breadcrumb_links = response.css('div.breadcrumbs a::text').extract()
    loader.add_value('category', breadcrumb_links[1:])

    loader.add_css('image_url', 'img#image::attr(src)')
    yield loader.load_item()
def parse_lenses(self, response):
    """Yield a single item loaded from a lens product page.

    The identifier is the value of the ``id`` input, suffixed with
    ``-<id_tipo>`` when the ``id_tipo`` input has a value. The same
    string is stored as the SKU.
    """
    loader = ProductLoader(item=Product(), response=response)

    base_id = response.xpath('//input[@name="id"]/@value').extract_first()
    type_id = response.xpath(
        '//input[@name="id_tipo"]/@value').extract_first()
    product_id = '-'.join((base_id, type_id)) if type_id else base_id

    loader.add_value('url', response.url)
    loader.add_css('name', '.nombre ::text')
    loader.add_xpath('price', '//*[@itemprop="price"]/text()')
    loader.add_css('category', '.breadcrumb a::text')
    loader.add_css('image_url', '.pag_producto img::attr(src)')
    loader.add_xpath('brand', '//meta[@itemprop="brand"]/@content')
    loader.add_value('identifier', product_id)
    loader.add_value('sku', product_id)
    yield loader.load_item()
def parse_product(self, response):
    """Scrape a product page from its ``.nosto_product`` block.

    Yields one item when the page has no ``Product.Config`` options
    data, or one item per option when exactly one option attribute is
    present. Pages with more than one option attribute are logged and
    produce nothing.

    The list price is stored as ``price``. When the sales price differs
    from it, the sales price is recorded as a ``Decimal`` under
    ``metadata['SalesPrice']``.
    """
    loader = ProductLoader(item=Product(), response=response)
    css = '.nosto_product .%s ::text'

    loader.add_css('identifier', css % 'product_id')
    loader.add_css('sku', css % 'product_id')
    for field in ('url', 'name', 'image_url', 'brand'):
        loader.add_css(field, css % field)

    list_price = response.css(css % 'list_price').extract_first()
    sales_price = response.css(css % 'price').extract_first()
    loader.add_value('price', list_price)

    if 'InStock' not in response.css(css % 'availability').extract_first():
        loader.add_value('stock', 0)

    # Only the last segment of the '/'-separated category is kept.
    category = response.css(css % 'category').extract_first()
    loader.add_value('category', category.split('/')[-1])

    options_data = response.xpath('//script/text()').re(
        'Product.Config.({.+})')
    if not options_data:
        item = loader.load_item()
        if sales_price != list_price:
            item['metadata'] = {'SalesPrice': Decimal(sales_price)}
        yield item
        return

    options_data = json.loads(options_data[0])
    if len(options_data['attributes']) > 1:
        self.log('More than one options attributes found on %s'
                 % response.url)
        return

    base_price = loader.get_output_value('price')
    base_name = loader.get_output_value('name')
    sales_price = Decimal(sales_price)

    # list(...) keeps the "first attribute" lookup working where
    # dict.values() returns a view instead of a list.
    attribute = list(options_data['attributes'].values())[0]
    for option in attribute['options']:
        product_id = option['products'][0]
        option_sales_price = sales_price + Decimal(option['price'])
        option_list_price = base_price + Decimal(option['oldPrice'])

        loader.replace_value('price', option_list_price)
        loader.replace_value('name', base_name + ' ' + option['label'])
        loader.replace_value('identifier', product_id)
        loader.replace_value('sku', product_id)
        loader.replace_xpath(
            'image_url',
            '//li[@id="simple-product-image-%s"]/a/@href' % product_id)

        # Scrapy's ItemLoader.load_item() returns the loader's own item
        # object on every call. Yield a copy so each option is a
        # separate item; otherwise a 'SalesPrice' set for one option
        # would carry over to later options that have no discount.
        item = loader.load_item().copy()
        if option_list_price != option_sales_price:
            item['metadata'] = {'SalesPrice': option_sales_price}
        yield item
def parse_product(self, response):
    """Yield a single item loaded from a product page.

    The identifier is the run of digits that precedes ``_BQ`` in the
    URL. The category is ``'Bedroom'`` followed by the text of the last
    breadcrumb link. A shipping cost of 5 is added when the loaded price
    is below 50.
    """
    loader = ProductLoader(Product(), response=response)

    url_match = re.search('(\d+)_BQ', response.url)
    loader.add_value('identifier', url_match.group(1))
    loader.add_value('url', response.url)
    loader.add_css('name', '.product-summary h1.product-title::text')
    loader.add_css('price', '.product-price::attr(content)')
    loader.add_css('sku', 'dl.product-code dd::text')

    loader.add_value('category', 'Bedroom')
    breadcrumb_links = response.css('.breadcrumb').xpath(
        './/li/a/text()').extract()
    loader.add_value('category', breadcrumb_links[-1])

    image_src = response.css('.main-img img::attr(src)').extract_first()
    if image_src:
        loader.add_value('image_url', response.urljoin(image_src))

    loader.add_xpath('brand',
                     '//th[text()="Brand"]/following-sibling::td/text()')

    loaded_price = loader.get_output_value('price')
    if loaded_price < 50:
        loader.add_value('shipping_cost', 5)

    yield loader.load_item()
def parse_product_options_config(self, response):
    """Scrape a product page and expand its ``Product.Config`` options.

    A base item is loaded from the page. If no options JSON is found in
    the script text, the base item is yielded alone. Otherwise one item
    is yielded per distinct product id listed in the options. Its
    identifier and SKU are ``<base identifier>-<product id>``. The
    labels of the options that list the product are added to its name,
    and those options' prices are added to the base price.
    """
    config_json = response.xpath('//script/text()').re_first(
        'Product.Config.*?({.+})')

    loader = ProductLoader(Product(), response=response)
    loader.add_xpath('identifier', '//input[@name="product"]/@value')
    loader.add_value('url', response.url)
    loader.add_css('name', 'div.product-name h1::text')
    loader.add_css('price', 'li.bigPrice span.price::text')
    loader.add_xpath('sku', '//input[@name="product"]/@value')
    breadcrumb_links = response.css('div.breadcrumbs a::text').extract()
    loader.add_value('category', breadcrumb_links[1:])
    loader.add_css('image_url', 'img#image::attr(src)')
    base_item = loader.load_item()

    if not config_json:
        yield base_item
        return

    config = json.loads(config_json)
    attributes = sorted(config['attributes'].values())

    # Every product id mentioned by any option of any attribute.
    product_ids = set(itertools.chain.from_iterable(
        option['products']
        for attr in attributes
        for option in attr['options']
    ))

    for product_id in product_ids:
        variant_loader = ProductLoader(Product(), response=response)
        variant_loader.add_value(None, base_item)

        variant_id = base_item['identifier'] + '-' + product_id
        variant_loader.replace_value('identifier', variant_id)
        variant_loader.replace_value('sku', variant_id)

        # Walk the options in attribute order and collect the label and
        # price of each one that lists this product.
        price = base_item['price']
        for attr in attributes:
            for option in attr['options']:
                if product_id not in option['products']:
                    continue
                variant_loader.add_value('name', option['label'])
                price += Decimal(option['price'])
        variant_loader.replace_value('price', price)

        yield variant_loader.load_item()