def parse_product(self, response):
    """Load a product from its detail page and follow up for shipping.

    ``sku`` and ``brand`` are taken from ``response.meta``; the remaining
    fields are read from the page. The stock figure is the number captured
    from the ``product_avail = N;`` script assignment.

    Yields the loaded item directly when the stock figure is ``'0'``;
    otherwise yields a ``FormRequest`` built from the ``orderform`` form
    that carries a copy of the item in ``meta['item']`` and is handled by
    ``self.parse_shipping``.
    """
    meta = response.meta

    loader = ProductLoader(Product(), response=response)
    loader.add_xpath('identifier', '//input[@name="productid"]/@value')
    loader.add_value('url', response.url)
    loader.add_css('name', '.descr::text')
    loader.add_css('price', 'span.currency::text')
    loader.add_value('sku', meta['sku'])

    thumbnail = response.css('img#product_thumbnail::attr(src)').extract_first()
    if thumbnail:
        loader.add_value('image_url', response.urljoin(thumbnail))
    loader.add_value('brand', meta['brand'])

    availability = response.css('.quantity script::text').re(
        'product_avail = (\d+);')
    stock = availability[0]
    loader.add_value('stock', stock)
    item = loader.load_item()

    if stock != '0':
        # In stock: the item is handed over to the shipping callback.
        yield FormRequest.from_response(
            response,
            formname='orderform',
            meta={'cookiejar': item['identifier'], 'item': Product(item)},
            cookies=self.cookies,
            callback=self.parse_shipping,
            dont_filter=True)
    else:
        yield item
def parse_product_base(self, response):
    """Build one product item from a product page.

    Returns ``None`` when the page has no ``div.primary_block h1`` title,
    otherwise the loaded ``Product`` item.

    The category is the last breadcrumb label. The brand is the first entry
    of ``self.brands`` that occurs in the category or that prefixes the
    product name (case-insensitive). The identifier is taken from the
    ``var id_product`` script variable, falling back to the ``id_product``
    form input. ``stock`` is set to 0 only when an availability label is
    present and it is not "In Stock".
    """
    name = response.css('div.primary_block h1::text').extract_first()
    if name is None:
        # No title: not a product page, nothing to load.
        return
    name = name.strip()

    breadcrumb = response.css('div.breadcrumb a span::text').extract()
    category = breadcrumb[-1].strip() if breadcrumb else ''

    category_lower = category.lower()
    name_lower = name.lower()
    product_brand = ''
    for brand in self.brands:
        brand_lower = brand.lower()
        if brand_lower in category_lower or name_lower.startswith(brand_lower):
            product_brand = brand
            break

    base_url = get_base_url(response)
    product_url = urljoin_rfc(base_url, response.url)

    images = response.xpath(
        '//div[@id="image-block"]//img[@itemprop="image"]/@src').extract()
    product_id = re.search(r'var id_product\D+(\d+)', response.body)

    prices = response.xpath(
        '//span[@id="our_price_display"]//text()').extract()
    # The last text node is used; a page without a price loads as 0.00.
    price = prices[-1] if prices else '0.00'

    loader = ProductLoader(response=response, item=Product())
    loader.add_value('url', product_url)
    loader.add_value('name', name)
    loader.add_value('brand', product_brand)
    if images:
        # A product without an image is still loaded, just without image_url.
        loader.add_value('image_url', urljoin_rfc(base_url, images[-1]))
    loader.add_value('price', price.replace(' ', '').replace(',', '.'))
    loader.add_value('category', category)
    loader.add_xpath('sku', '//p[@id="product_reference"]/span/text()')
    if product_id:
        loader.add_value('identifier', product_id.group(1))
    else:
        loader.add_xpath('identifier',
                         '//form//input[@name="id_product"]/@value')

    stock = response.xpath(
        '//span[@id="availability_value"]/text()').extract_first()
    if stock and stock.title() != 'In Stock':
        loader.add_value('stock', 0)
    return loader.load_item()
def parse_product(self, response):
    """Yield one contact-lens product item, with promotion metadata.

    The name is the stripped title spans joined with spaces. The identifier
    (also used as the sku) is the ``ecomm_prodid`` number found in the page
    body; an ``IndexError`` is raised when the body does not contain it.
    The promotion text, if any, is attached as ``item['metadata']``.
    """
    title_parts = response.xpath(
        '//h1[@class="product-view__title"]/span/text()').extract()
    name = ' '.join(part.strip() for part in title_parts)

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', name)
    loader.add_xpath(
        'price',
        '//div[contains(@class, "product-view__total-price")]/@data-price')

    # NOTE(review): the image URL is read from @alt, not @src -- looks
    # unusual; confirm against the live page before changing.
    image_url = response.xpath('//img[@itemprop="image"]/@alt').extract()
    if image_url:
        loader.add_value('image_url', 'http:' + image_url[0])

    loader.add_xpath(
        'brand',
        '//div[@class="product-view__brand brand"]'
        '/img[@class="brand__image"]/@alt')
    loader.add_value('category', 'Kontaktlinser')
    loader.add_value('url', response.url)

    identifier = re.findall(r'"ecomm_prodid":"(\d+)","', response.body)[0]
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)

    promotion_texts = response.xpath(
        '//section[contains(@class, "product-view--product-page")]'
        '//figcaption[@class="splash__inner"]//text()').extract()
    # Blank fragments are dropped; no promotion at all gives ''.
    promotion = ' '.join(
        text for text in (raw.strip() for raw in promotion_texts) if text)

    metadata = SpecSaversMeta()
    metadata['promotion'] = promotion

    item = loader.load_item()
    item['metadata'] = metadata
    yield item
def parse_product(self, response):
    """Yield one item per option row of a product page.

    The page-level identifier/sku is the first run of four digits in the
    URL. When the page has no option rows, the page-level item is yielded
    with a price of 0. Otherwise each row yields a copy of the page-level
    item with identifier, sku, price and name overridden from the row.
    """
    product_code = re.search('\d\d\d\d', response.url).group(0)

    base_loader = ProductLoader(Product(), response=response)
    for field in ('identifier', 'sku'):
        base_loader.add_value(field, product_code)
    base_loader.add_value('url', response.url)
    base_loader.add_xpath('name', '//header[@class="prodCat"]/h1/text()')

    crumbs = response.css('.bread li a::text').extract()[1:]
    crumbs.extend(response.css('.bread li:last-child::text').extract())
    base_loader.add_value('category', crumbs)

    picture = response.css('.detimg a::attr(href)').extract_first()
    if picture:
        base_loader.add_value('image_url', response.urljoin(picture))
    base_item = base_loader.load_item()

    rows = response.css('.tbl').xpath('.//*[@class="tr"]')
    if rows:
        for row in rows:
            row_loader = ProductLoader(Product(), selector=row)
            row_loader.add_value(None, base_item)
            row_code = row.xpath('.//input/@name').extract_first()
            row_loader.replace_value('identifier', row_code)
            row_loader.replace_value('sku', row_code)
            # The "now" price goes first; the plain cell text is added
            # after it as a further candidate.
            row_loader.replace_css('price', '.tc-price .pr-now::text')
            row_loader.add_css('price', '.tc-price::text')
            row_loader.replace_css('name', '.tc-title::text')
            yield row_loader.load_item()
    else:
        base_item['price'] = 0
        yield base_item
def parse_product(self, response):
    """Yield the items ``self.parse_options`` produces for a product page.

    A single shared loader is filled with url, brand and category. For the
    main ``article#product`` (if present) and then for each
    ``article.bdp-item``, the image URL is replaced on that loader and the
    relevant script text is handed to ``self.parse_options``.
    """
    page = HtmlXPathSelector(response)
    root_url = get_base_url(response)

    shared_loader = ProductLoader(selector=page, item=Product())
    shared_loader.add_value('url', response.url)
    shared_loader.add_xpath(
        'brand', './/dt[text()="Brand"]/following-sibling::dd[1]/text()')
    shared_loader.add_xpath(
        'category', './/div[contains(@class, "breadcrumbs")]//a/text()')

    if page.select('//article[@id="product"]'):
        main_images = page.select(
            './/div[@id="amplienceContent"]//img/@src').extract()
        shared_loader.replace_value('image_url',
                                    urljoin(root_url, main_images[0]))
        main_scripts = page.select(
            '//script[@type="text/javascript"]/text()[contains(., "productData")]'
        ).extract()
        for parsed in self.parse_options(page, root_url, shared_loader,
                                         main_scripts):
            yield parsed

    for article in page.select('//article[@class="bdp-item"]'):
        article_image = article.select(
            './/a[contains(@id, "mainImage")]/img/@src').extract()[0]
        shared_loader.replace_value('image_url',
                                    urljoin(root_url, article_image))
        article_scripts = article.select(
            './div/div[1]//script[@type="text/javascript"]/text()').extract()
        for parsed in self.parse_options(article, root_url, shared_loader,
                                         article_scripts):
            yield parsed
def parse_product(self, response):
    """Yield one product item, or nothing when the page has no name.

    Identifier, sku and brand are read from ``data-flix-*`` script
    attributes; identifier and sku fall back to the text next to the
    "Model" label. Stock is set to 0 unless the "Central Warehouse" text
    contains "Available". Pages showing the "Exdisplay" image get an
    ``Ex Display`` metadata entry.
    """
    flix_attr = '//script[@type="text/javascript"]/@data-flix-%s'

    titles = response.xpath('//td/div[@align="center"]/b/text()').extract()
    if not titles:
        return

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', titles[0].strip(' ,'))
    loader.add_value('url', response.url)

    # Only non-blank EAN values are kept, so a non-empty list is usable.
    ean_values = [value
                  for value in response.xpath(flix_attr % 'ean').extract()
                  if value.strip()]
    if not ean_values:
        ean_values = response.xpath(
            '//b[contains(text(), "Model :")]/../text()[1]').extract()

    mpn_values = response.xpath(flix_attr % 'mpn').extract()
    if not (mpn_values and mpn_values[0]):
        mpn_values = response.xpath(
            '//b[contains(text(), "Model")]/../text()[1]').extract()

    loader.add_value('identifier', ean_values)
    loader.add_value('sku', mpn_values)
    loader.add_value('price',
                     re.findall(u'POST.+?> *€(.+?) *<', response.body))
    loader.add_xpath('category', '//h8//a[position()>1]/text()')
    loader.add_xpath('brand', flix_attr % 'brand')

    warehouse = response.xpath(
        '//button[@value="Central Warehouse"]/../text()').extract_first()
    if not (warehouse and 'Available' in warehouse):
        loader.add_value('stock', 0)

    item = loader.load_item()
    if response.xpath('//img[@alt="Exdisplay"]'):
        item['metadata'] = {'Ex Display': 'Ex Display'}
    yield item
def parse_doors(self, response):
    """Yield one product item per ``itemprop="offers"`` block on the page.

    Identifiers come from the ``ecomm_prodid`` array literal found in the
    page scripts and are matched to the offer blocks by position. A product
    without its own link uses the page's canonical URL. A shipping cost of
    36 is added when the loaded price is below 750.

    Raises ``IndexError`` when the page has no ``ecomm_prodid`` array or
    fewer ids than offers, and ``ValueError``/``SyntaxError`` when the
    array is not a plain literal.
    """
    import ast  # local import: only this callback needs it

    canonical_url = response.xpath('//link[@rel="canonical"]/@href').extract()
    category = response.xpath(
        '//p[@class="breadcrumbs"]/a[position()>1]/text()').extract()

    raw_ids = response.xpath('//script/text()').re(
        r'ecomm_prodid.*(\[.+\])')
    # The array comes from remote page content, so it is parsed as a literal
    # only; eval() would execute whatever expression the page contained.
    ids = ast.literal_eval(raw_ids[0])

    offers = response.xpath('//div[@itemprop="offers"]')
    for index, product in enumerate(offers):
        loader = ProductLoader(item=Product(), selector=product)
        loader.add_xpath('name', './/h3[@itemprop="name"]/a/text()[1]')
        loader.add_value('identifier', ids[index])
        loader.add_value('sku', ids[index])
        loader.add_xpath('price', './/span[@itemprop="price"]/text()')

        href = product.xpath('.//h3[@itemprop="name"]/a/@href').extract()
        if href:
            loader.add_value('url', response.urljoin(href[0]))
        else:
            loader.add_value('url', canonical_url)

        image_url = product.xpath('.//a/img/@src').extract()
        if image_url:
            # An offer without an image no longer aborts the whole page.
            loader.add_value('image_url', response.urljoin(image_url[0]))

        loader.add_value('category', category)
        if not product.xpath(
                'link[@itemprop="availability"]'
                '[@href="http://schema.org/InStock"]'):
            loader.add_value('stock', 0)
        if loader.get_output_value('price') < 750:
            loader.add_value('shipping_cost', 36)
        yield loader.load_item()
def parse_product(self, response):
    """Yield one contact-lens product item scraped from a product page.

    The name is the stripped title spans joined with spaces. The identifier
    (also used as the sku) is the ``ecomm_prodid`` number found in the page
    body; an ``IndexError`` is raised when the body does not contain it.
    """
    title_parts = response.xpath(
        '//h1[@class="product-view__title"]/span/text()').extract()
    name = ' '.join(part.strip() for part in title_parts)

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', name)
    loader.add_xpath(
        'price',
        '//div[contains(@class, "product-view__total-price")]/@data-price')

    # NOTE(review): the image URL is read from @alt, not @src -- looks
    # unusual; confirm against the live page before changing.
    image_url = response.xpath('//img[@itemprop="image"]/@alt').extract()
    if image_url:
        loader.add_value('image_url', 'http:' + image_url[0])

    loader.add_xpath(
        'brand',
        '//div[@class="product-view__brand brand"]'
        '/img[@class="brand__image"]/@alt')
    loader.add_value('category', 'Kontaktlinser')
    loader.add_value('url', response.url)

    identifier = re.findall(r'"ecomm_prodid":"(\d+)","', response.body)[0]
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)
    yield loader.load_item()
def parse_product(self, response):
    """Yield one item per option row of the product summary table.

    Name and sku come from the structured product data returned by
    ``SpiderSchema``; the option label is appended to the name. The
    identifier is the first token of the row's ``class`` attribute and the
    price is the row's ``data-price``. Rows whose class does not match
    ``instock`` get a stock of 0.
    """
    schema_product = SpiderSchema(response).get_product()
    rows = response.xpath(
        '//div[@class="summary-container"]/table//tr[not(th)]')

    for row in rows:
        label = row.xpath(
            './/td[contains(@class,"optionscol")]/text()').extract()[0]
        row_classes = row.xpath('@class')
        row_id = row_classes.extract()[0].split(' ')[0]
        row_price = row.xpath('@data-price').extract()

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value(
            'name', u'{} - {}'.format(schema_product['name'], label))
        loader.add_value('url', response.url)
        loader.add_value('sku', schema_product['sku'])
        loader.add_value('identifier', row_id)

        if 'image' not in schema_product:
            loader.add_xpath('image_url',
                             '//meta[@itemprop="og:image"]/@content')
        else:
            loader.add_value('image_url', schema_product['image'])

        if not row_classes.re('instock'):
            loader.add_value('stock', 0)

        loader.add_value('price', row_price)
        loader.add_css('category',
                       'div.product_meta span.posted_in a::text')
        yield loader.load_item()
def parse_product(self, response):
    """Yield one product item built from regex matches on the page body.

    Nothing is yielded when no ``tier_price_total`` value is found. The
    price is the first such value quantized to two decimals; a shipping
    cost of 4.98 is added when the loaded price is below 70. The category
    is the "Lenstype" link text, or the "Producttype" one when the former
    is absent.
    """
    body = response.body
    product_ids = re.findall('product_id.+?(\d+)', body)

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('identifier', product_ids)
    loader.add_xpath('url', '//link[@rel="canonical"]/@href')
    loader.add_value('name', re.findall('"name":"(.+?)"', body))

    tier_totals = re.findall('tier_price_total".+?([\d.]+)', body)
    if not tier_totals:
        return
    loader.add_value('price',
                     Decimal(tier_totals[0]).quantize(Decimal('.01')))
    loader.add_value('sku', product_ids)

    lens_type = re.findall(
        '<span class="technical_label">Lenstype:</span><a href.+?>(.+?)</a',
        body)
    if not lens_type:
        lens_type = re.findall(
            '<span class="technical_label">Producttype:</span><a href.+?>(.+?)</a',
            body)
    loader.add_value('category', lens_type)

    loader.add_value(
        'image_url',
        re.findall('<img src="(\S+media/catalog/product\S+)"', body))
    loader.add_value(
        'brand',
        re.findall(
            '<span class="technical_label">Merk:</span><a href.+?>(.+?)</a',
            body))

    if loader.get_output_value('price') < 70:
        loader.add_value('shipping_cost', '4.98')
    yield loader.load_item()
def parse_product(self, response):
    """Yield one product item scraped from a product detail page.

    The identifier is the ``ecomm_prodid`` number from the page scripts and
    the name is all ``h1`` text with whitespace runs collapsed. Missing
    image or brand are loaded as empty strings. Stock is set to 0 unless
    the default stock label contains "in Stock".
    """
    loader = ProductLoader(Product(), response=response)
    loader.add_value(
        'identifier',
        response.xpath('//script/text()').re('ecomm_prodid: *(\d+),'))
    loader.add_value('url', response.url)

    heading_text = ''.join(response.xpath('//h1//text()').extract())
    loader.add_value('name', ' '.join(heading_text.split()))

    loader.add_css('price', 'span.GBP::attr(content)')
    loader.add_xpath('sku', '//span[@id="js-product-reference"]/@data-ref')

    crumbs = response.xpath(
        '//div[contains(@class, "breadcrumb")]//a/span/text()').extract()
    loader.add_value('category', crumbs[1:])

    zoom_links = response.xpath(
        '//a[@class="product__image__zoom-link"]/@href').extract()
    if zoom_links:
        picture = response.urljoin(zoom_links[0])
    else:
        picture = ''
    loader.add_value('image_url', picture)

    brand_names = response.xpath(
        '//span[@class="product-content__title--brand"]/text()').extract()
    if brand_names:
        brand_name = brand_names[0].strip()
    else:
        brand_name = ''
    loader.add_value('brand', brand_name)

    in_stock = response.xpath(
        '//span[@id="js-product-in-stock-default" and contains(text(), "in Stock")]'
    )
    if not in_stock:
        loader.add_value('stock', 0)
    yield loader.load_item()
def parse_product(self, response):
    """Yield one item per row of the page's product table.

    A page-level item (url, name, sku, category, image, stock) is loaded
    first and copied into every row's loader. Each row overrides sku,
    identifier (``<sku>-<md5 of page name + row name>``) and price, and
    appends the row name when it is not already part of the page name.
    Pages without table rows yield nothing.
    """
    page_loader = ProductLoader(Product(), response=response)
    page_loader.add_value('url', response.url)
    page_loader.add_xpath('name', '//h1[@itemprop="name"]/text()')

    page_sku = response.xpath(
        '//div[@itemprop="description"]/div/div[last()]/text()'
    ).extract_first()
    for field in ('identifier', 'sku'):
        page_loader.add_value(field, page_sku)

    crumbs = response.css('.breadcrumbs a::text').extract()[1:]
    crumbs.extend(
        response.css('.breadcrumbs li:last-of-type::text').extract())
    page_loader.add_value('category', crumbs)

    picture = response.css('img.gallery-main-image::attr(src)').extract_first()
    if picture:
        page_loader.add_value('image_url', response.urljoin(picture))
    if not response.css('.in-stock'):
        page_loader.add_value('stock', 0)
    page_item = page_loader.load_item()

    for row in response.css('table.product-table tbody tr'):
        row_loader = ProductLoader(Product(), selector=row)
        row_loader.add_value(None, page_item)

        row_sku = row.css('span.product-code::text').re('\((.+)\)')[0]
        row_name = row.css('span.product-name::text').extract_first()
        digest = hashlib.md5(page_item['name'] + row_name).hexdigest()

        row_loader.replace_value('identifier', '-'.join([row_sku, digest]))
        row_loader.replace_value('sku', row_sku)

        # The RRP cell is loaded first; the last text node of the price
        # cell is then applied through replace_value.
        row_loader.add_css('price', 'span.product-price-rrp')
        row_price = row.css('td.product-price').xpath(
            'text()[last()]').extract_first()
        row_loader.replace_value('price', row_price)

        if row_name not in page_item['name']:
            row_loader.add_value('name', row_name)
        yield row_loader.load_item()
def parse_product(self, response):
    """Yield a product item, or requests for each option of the page.

    When the page has a ``.pg_select`` selector with no option marked as
    selected, every ``data-href`` found in it is requested (query string
    cleaned) with this same callback and no item is produced. Otherwise a
    single item is loaded from the page.
    """
    selector_box = response.css('.pg_select')
    if selector_box and not selector_box.xpath('option[@selected]'):
        for href in selector_box.xpath('.//@data-href').extract():
            yield Request(response.urljoin(url_query_cleaner(href)),
                          self.parse_product)
        return

    loader = ProductLoader(Product(), response=response)
    product_sku = response.xpath(
        '//div[@id="content"]//input[@name="sku"]/@value').extract_first()
    loader.add_value('identifier', product_sku)
    loader.add_value('sku', product_sku)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//strong[@itemprop="name"]/text()')

    # Price candidates, added in this order.
    for price_css in ('div.show h5 ::text',
                      '.nowPrice ::text',
                      '.typicalPrice h5 ::text'):
        loader.add_css('price', price_css)

    category_path = response.xpath(
        '//input[@name="productDetailsDTO"]/@value').re('"category":"(.+?)"')
    if category_path:
        loader.add_value('category', category_path[0].split('/'))

    picture = response.css('ul#galleryImages a::attr(href)').extract_first()
    if picture:
        loader.add_value('image_url', response.urljoin(picture))

    loader.add_xpath(
        'brand', '//span[@itemprop="brand"]//span[@itemprop="name"]/text()')
    if response.css('div#content p.oos'):
        loader.add_value('stock', 0)
    yield loader.load_item()
def parse_product(self, response):
    """Yield the page's product, or one item per variation.

    Without a ``.variations`` element the page-level item is yielded as is.
    Otherwise the JSON in the form's ``data-product_variations`` attribute
    is decoded and each variation yields a copy of the page-level item with
    the attribute option labels appended to the name and with the
    variation's price and id.
    """
    product_ids = response.xpath('//div[@itemscope]/@id').re('product-(.+)')

    base_loader = ProductLoader(item=Product(), response=response)
    base_loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    base_loader.add_xpath('price', '//meta[@itemprop="price"]/@content')
    base_loader.add_xpath('url', '//link[@rel="canonical"]/@href')
    base_loader.add_value(
        'category', response.css('.breadcrumb a::text').extract()[1:])
    base_loader.add_value('brand', response.meta['brand'])
    base_loader.add_xpath('image_url', '//div/@data-original-img')
    base_loader.add_value('identifier', product_ids)
    product = base_loader.load_item()

    if not response.css('.variations'):
        yield product
        return

    raw_variations = response.xpath(
        '//form/@data-product_variations').extract_first()
    for entry in json.loads(raw_variations):
        entry_loader = ProductLoader(item=Product(product),
                                     response=response)
        option_values = entry['attributes'].values()
        entry_loader.replace_value('name', product['name'])
        for option_value in option_values:
            entry_loader.add_xpath(
                'name', '//option[@value="%s"]/text()' %option_value)
        entry_loader.replace_value('price', entry['display_price'])
        entry_loader.replace_value('identifier', entry['variation_id'])
        yield entry_loader.load_item()
def parse_product(self, response):
    """Yield a product item, re-requesting the page while it has no code.

    When ``span#thisstkcode`` is missing the same request is re-issued
    (``dont_filter=True``) with ``meta['retries']`` incremented, and nothing
    else is yielded for this response. After ten retries a warning is
    logged and the item is loaded from whatever the page provides.
    """
    identifier = response.css('span#thisstkcode::text').extract_first()
    if not identifier:
        retries = response.meta.get('retries', 0)
        if retries <= 9:
            self.logger.debug('Retry %s to get identifier' % response.url)
            # Copy so the retry counter does not leak into this response's
            # own meta dict.
            meta = dict(response.meta)
            meta['retries'] = retries + 1
            # dont_filter must be a keyword argument: Request.replace()
            # forwards positional arguments to the Request constructor,
            # where a stray string collides with ``url``.
            yield response.request.replace(dont_filter=True, meta=meta)
            # The retried response produces the item; do not also emit an
            # identifier-less duplicate from this one.
            return
        self.logger.warning('No identifier found on %s' % response.url)

    loader = ProductLoader(Product(), response=response)
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1/text()')

    price = response.css('span.prodPrice').xpath(
        './/span[@itemprop="price"]/text()').extract_first()
    loader.add_value('price', price)

    category = response.css('.breadcrumbs span::text').extract()[1:]
    loader.add_value('category', category)
    loader.add_css('image_url', '.main-product-photo::attr(href)')
    loader.add_css('brand', 'span#thisbrand::text')
    loader.add_css('stock', 'input#data-stock-qty::attr(value)')
    yield loader.load_item()
def parse_product(self, response):
    """Yield a product item, or requests for every product of a range page.

    Pages linking to ``#product-range`` only produce requests for the
    products of that range. Otherwise one item is loaded; the categories
    are the breadcrumb labels between the second and the last one, minus
    'in the kitchen' and 'baking'. A shipping cost of 2.99 is set when the
    price is below 30.
    """
    page = HtmlXPathSelector(response)
    root_url = get_base_url(response)

    if page.select('//a[@href="#product-range"]'):
        range_links = page.select(
            '//section[contains(@class, "product-range")]//div/a/@href'
        ).extract()
        for link in range_links:
            yield Request(urljoin(root_url, link),
                          callback=self.parse_product)
        return

    loader = ProductLoader(item=Product(), selector=page)
    loader.add_xpath('name', '//h1[@class="fn c-both"]/text()')
    # Two XPath expressions are passed: the price node, then the literal 0.
    loader.add_xpath('price',
                     ('//span[@class="cta now-price"]/text()', '0'))
    if not page.select('//select[@id="quantity"]'):
        loader.add_value('stock', 0)

    crumbs = page.select(
        '//section[@class="breadcrumbs"]//a/text()').extract()[2:-1]
    for unwanted in ('in the kitchen', 'baking'):
        if unwanted in crumbs:
            crumbs.remove(unwanted)
    loader.add_value('category', crumbs)

    loader.add_value('brand', "Lakeland")
    for code_field in ('identifier', 'sku'):
        loader.add_xpath(code_field, '//meta[@name="productcode"]/@content')
    loader.add_xpath('image_url', '//img[@class="main-image"]/@src')
    loader.add_value('url', response.url)

    product = loader.load_item()
    if product.get('price', 30) < 30:
        product['shipping_cost'] = 2.99
    yield product
def parse_product(self, response):
    """Yield the product only if it matches the searched sku and brand set.

    ``response.meta`` supplies ``sku`` (the searched part number),
    ``brand`` (the value stored on the item) and ``brands`` (the accepted
    upper-case brand names). Nothing is yielded when the page's part
    number differs from the searched one or its brand is not accepted. A
    shipping cost of 5 is set when the price is below 50.
    """
    meta = response.meta
    brand = meta['brand']
    accepted_brands = meta['brands']
    loader = ProductLoader(Product(), response=response)
    wanted_sku = meta['sku']

    sku = response.css('.part-number strong::text').extract_first()
    if not (sku and sku.strip().upper() == wanted_sku):
        return

    page_brand = response.xpath(
        '//tr[th[contains(text(), "Brand")]]/td[contains(@class, "data")]/text()'
    ).extract()[0]
    if page_brand.upper().strip() not in accepted_brands:
        return

    loader.add_value('identifier', sku)
    loader.add_value('url', response.url)
    loader.add_css('name', '.product-name .h1::text')
    loader.add_xpath(
        'price', '//span[contains(@id, "price-excluding-tax")]/text()')
    loader.add_value('sku', sku)
    loader.add_value('category',
                     response.css('.breadcrumbs a::text').extract()[1:])
    loader.add_css('image_url', 'img#image-main::attr(src)')
    loader.add_value('brand', brand)
    if response.css('.availability .out-of-stock'):
        loader.add_value('stock', 0)

    item = loader.load_item()
    if item['price'] < 50:
        item['shipping_cost'] = 5
    yield item
def parse_product(self, response):
    """Yield one lens product item scraped from a product page.

    The name is the ``lensname`` heading, followed by the model name when
    present. The price is the tier-table entry for a quantity of 1, falling
    back to the ``itemprop="price"`` meta tag. The identifier (also used as
    the sku) is the ``productsId`` number found in the page body.

    Raises ``IndexError`` when the heading, every price source, or the
    ``productsId`` assignment is missing.
    """
    name = response.xpath(
        '//div[@class="lensname"]/h1/text()').extract()[0].strip()
    model_name = response.xpath(
        '//div[@class="lensname"]/span[@class="name-model"]/text()'
    ).extract()
    if model_name:
        name = name + ' ' + model_name[0]

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', name)

    price = response.xpath(
        '//div[@id="tiered_box_red"]//tr[td[text()="1"]]/td/strong/text()'
    ).extract()
    if not price:
        price = response.xpath(
            '//meta[@itemprop="price"]/@content').extract()[0]
    loader.add_value('price', price)

    image_url = response.xpath('//img[@itemprop="image"]/@src').extract()
    if image_url:
        loader.add_value('image_url', image_url[0])

    loader.add_xpath('brand', '//meta[@itemprop="brand"]/@content')
    categories = response.xpath(
        '//div[@id="prodBreadCrumbs"]/a/text()').extract()
    loader.add_value('category', categories)
    loader.add_value('url', response.url)

    identifier = re.findall(r'productsId = "(\d+)";', response.body)[0]
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)
    yield loader.load_item()
def parse_product(self, response):
    """Yield requests for every variant, then the item for this page.

    The variant data is the escaped JSON object found in the page scripts.
    Each entry of ``Variants`` is requested with this same callback. The
    selected variant names (up to three) are appended to the product name
    and recorded in the item's metadata under their header; a name
    containing "size" also sets ``metadata['size']``.
    """
    escaped_json = response.xpath('//script/text()').re(
        '{\\\\"Variants.+}')[0]
    data = json.loads(escaped_json.replace('\\"', '"'))

    for variant in data['Variants']:
        variant_url = response.urljoin(variant['ProductPLU'])
        yield Request(make_variant_url(variant_url), self.parse_product)

    loader = ProductLoader(item=Product(), response=response)
    plu = response.xpath('//input[@id="ProductPLU"]/@value').extract_first()
    loader.add_value('identifier', plu)
    loader.add_value('sku', plu)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '(//h1[@itemprop="name"]/text())[1]')

    metadata = {}
    for position in (1, 2, 3):
        selected = data['Variant%dSelected' %position]
        if not selected or selected == 'N/A':
            continue
        loader.add_value('name', selected)
        metadata[data['Variant%dHeader' %position]] = selected
        if 'size' in selected.lower():
            metadata['size'] = selected[5:].strip()

    price_texts = response.css('.price-value .currency::text').extract()
    loader.add_value('price', price_texts.pop())

    crumbs = response.css('.breadcrumb a::text').extract()
    loader.add_value('category', crumbs[1:])
    loader.add_css('image_url', '.product-image::attr(src)')
    loader.add_xpath('brand', '//meta[@itemprop="brand"]/@content')
    loader.add_value('shipping_cost', '7.95')

    available_online = response.css(
        '.product-stock-widget::attr(ng-init)').re(
            'AvailableOnline: (\w+)')[0]
    if available_online != 'true':
        loader.add_value('stock', 0)

    item = loader.load_item()
    item['metadata'] = metadata
    yield item
def parse_product(self, response):
    """Yield one item per attribute combination whose SKU is in ``self.skus``.

    The product id is the trailing ``p<digits>`` of the cleaned page URL.
    Option data is fetched from the ``get_product_options`` endpoint with
    ``urlopen``: once to list the attributes, then once per combination of
    attribute values. Combinations whose SKU is not a key of ``self.skus``
    are skipped. Every yielded item carries an ``AndrewJamesMeta`` with the
    ASIN looked up in ``self.skus``.

    NOTE(review): ``urlopen`` is presumably the blocking urllib call, made
    inside a spider callback -- confirm this is intended.
    """
    base_sku = response.xpath('//@data-ref').extract_first()
    identifier = re.search('p(\d+)$', url_query_cleaner(response.url)).group(1)
    url = 'https://www.andrewjamesworldwide.com/ajax/get_product_options/{0}'.format(
        identifier)
    data = json.load(urlopen(url))
    attributes = [attr['values'] for attr in data['attributes']]
    if [] in attributes:
        # Some attribute has no values yet: re-query with the first value of
        # the first attribute selected and read the attribute lists again.
        url = add_or_replace_parameter(url, 'attributes[1]',
                                       attributes[0][0]['value_id'])
        data = json.load(urlopen(url))
        attributes = [attr['values'] for attr in data['attributes']]
    # Every combination of one value per attribute.
    variants = itertools.product(*attributes)
    for variant in variants:
        url = 'https://www.andrewjamesworldwide.com/ajax/get_product_options/{0}'.format(
            identifier)
        # Attribute parameters are numbered from 1.
        for idx, option in enumerate(variant):
            url = add_or_replace_parameter(
                url, 'attributes[{0}]'.format(idx + 1), option['value_id'])
        data = json.load(urlopen(url))
        # NOTE(review): indexing .values() only works where it returns a list
        # (Python 2); which entry comes first depends on dict ordering.
        selection = data['selection'].values()[0]
        sku = selection['reference'].strip()
        # Fall back to the page-level SKU, but only if it was not used before.
        if not sku and base_sku not in self.skus_found:
            sku = base_sku
        if sku not in self.skus.keys():
            continue
        # A duplicate is only logged; the item is still produced.
        if sku in self.skus_found:
            self.logger.info('Duplicated SKU is found: %s' % sku)
        self.skus_found.add(sku)
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('sku', sku)
        loader.add_value('identifier', selection['product_id'])
        loader.add_xpath('name', '//span[@id="js-product-title"]/text()')
        loader.add_value('name', [option['value'] for option in variant])
        # The selection title is applied last through replace_value.
        loader.replace_value('name', selection['title'])
        loader.add_value('url', response.url)
        loader.add_value('price', selection['price_inc'])
        category = response.css('div.breadcrumb a::attr(title)').extract()
        loader.add_value('category', category[1:])
        try:
            # First image of each value of the last attribute.
            image_url = [
                attr['images'][0]['image']
                for attr in data['attributes'][-1]['values']
            ]
        except IndexError:
            # No attribute images: use the page's own product image.
            image_url = response.xpath(
                '//div[@id="js-product-image"]//@src').extract()
        loader.add_value('image_url', response.urljoin(image_url[0]))
        loader.add_value('brand', "Andrew James")
        item = loader.load_item()
        metadata = AndrewJamesMeta()
        metadata['asin'] = self.skus[sku]['ASIN']
        item['metadata'] = metadata
        yield item
def parse_product(self, response):
    """Yield one product item, or nothing when the page has no item id.

    The brand is the schema.org manufacturer text, falling back to the
    schema.org brand text. The category comes from ``response.meta`` or,
    failing that, from the second-to-last breadcrumb link. The price is the
    first pricing-table cell up to any "(" with the pound sign removed,
    quantized to two decimals; pages without it load a price of 0. Stock is
    the first number in the availability text, or 0.
    """
    hxs = HtmlXPathSelector(response)
    brand = (
        response.xpath(
            '//span[@itemprop="http://schema.org/manufacturer"]/text()'
        ).extract_first()
        or response.xpath(
            '//span[@itemprop="http://schema.org/brand"]/text()'
        ).extract_first())

    identifier = hxs.select('//input[@id="itemsArray"]/@value').extract()
    if not identifier:
        return

    sku = response.xpath('//*[@itemprop="mpn"]/text()').extract()[0].strip()
    product_loader = ProductLoader(item=Product(), selector=hxs)

    image_url = response.css(
        'img#productMainImage::attr(src)').extract_first()
    if image_url:
        product_loader.add_value('image_url', response.urljoin(image_url))

    category = response.meta.get('category', '')
    if not category:
        category = hxs.select(
            '//div[@id="breadcrumb"]/ul/li/a/text()').extract()[-2].strip()
    product_loader.add_value('category', category)

    # The name is loaded as the list of whitespace-separated words.
    product_name = response.xpath(
        '//div[@id="product"]//h1//text()').re(r'\S+')
    product_loader.add_value('name', product_name)

    # NOTE(review): this XPath has no leading '//' -- confirm it can match
    # the canonical link at all; response.url is added right after it.
    product_loader.add_xpath('url', 'link[@rel="canonical"]/@href')
    product_loader.add_value('url', response.url)
    product_loader.add_value('identifier', identifier[-1])
    product_loader.add_value('brand', brand)
    product_loader.add_value('sku', sku)

    price_text = ''.join(
        hxs.select(
            '//table[contains(@class, "pricing")]'
            '//td[@class="threeColTd"][1]/text()'
        ).extract()).strip()
    price_text = price_text.split('(')[0].strip().replace(u'\xa3', '')
    if price_text:
        price = extract_price(price_text).quantize(Decimal('.01'))
        product_loader.add_value('price', price)
    else:
        product_loader.add_value('price', 0)

    stock = response.css('span.availability::text').re(r'\d+')
    product_loader.add_value('stock', stock[0] if stock else 0)
    yield product_loader.load_item()
def parse_product(self, response):
    """Yield the page's product followed by one item per listed variant.

    Nothing is yielded when ``SpiderSchema`` cannot provide product data
    (an error is logged) or when that data has no ``name``. Variants are
    decoded with ``demjson`` from the first body line containing
    ``variants:``; each one yields a copy of the main product with the
    variant's name suffix, product code, variant id and price. A shipping
    cost of 3.5 is added when the loaded price is below 45.
    """
    try:
        pdata = SpiderSchema(response).get_product()
    except Exception:
        # Any schema failure is treated as "no product on this page", but
        # interpreter-exit exceptions are no longer swallowed.
        self.logger.error('No structured product data on %s', response.url)
        return

    options = None
    js_line = next(
        (line for line in response.body.split('\n') if 'variants:' in line),
        '')
    if js_line:
        # The capture group is optional, so it is None when the line has no
        # "};" terminator; such a line is treated as having no variants.
        raw_variants = re.search(r'variants:(.*};)?', js_line).group(1)
        if raw_variants:
            # Drop the trailing "};" before decoding.
            options = demjson.decode(raw_variants[:-2].strip())

    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_css('sku', 'span.pd_productVariant::text')
    product_loader.add_xpath('identifier',
                             '//input[@name="productId"]/@value')
    product_loader.add_value('url', response.url)
    try:
        product_loader.add_value('name', pdata['name'])
    except KeyError:
        return

    category = response.xpath(
        '//*[@id="breadcrumb"]//a/text()').extract()[1:-1]
    product_loader.add_value('category', category)

    img = response.xpath('//meta[@property="og:image"]/@content').extract()
    if img:
        product_loader.add_value('image_url', response.urljoin(img[-1]))

    price = response.xpath(
        '//p[@class="productOfferPrice"]/text()').extract()[0]
    product_loader.add_value('price', price)
    if product_loader.get_output_value('price') < 45:
        product_loader.add_value('shipping_cost', '3.5')

    brand = response.xpath('//*[@id="brandHeader"]/a/@href').extract()
    if brand:
        # The brand is the link path without the "/en/" prefix and without
        # its last character; paths with further segments are ignored.
        brand = brand[0].replace('/en/', '')[:-1]
        if '/' not in brand:
            product_loader.add_value('brand', brand)

    stock = response.xpath(
        '//link[@itemprop="availability"]/@href').extract_first()
    if stock != 'http://schema.org/InStock':
        product_loader.add_value('stock', 0)

    product = product_loader.load_item()
    yield product

    if options:
        for key, val in options.items():
            option_product = Product(product)
            option_product['name'] = (
                product['name'] + ' ' + key.replace('_', ' '))
            option_product['sku'] = val['productCode']
            option_product['identifier'] = val['variantId']
            option_product['price'] = extract_price(val['nowPrice'])
            yield option_product
def parse_simple_product(self, response):
    """Yield one product item loaded from a single-product page.

    The ``product`` form input supplies both identifier and sku; the
    category is every breadcrumb link label except the first.
    """
    product_id_xpath = '//input[@name="product"]/@value'
    crumbs = response.css('div.breadcrumbs a::text').extract()

    loader = ProductLoader(Product(), response=response)
    loader.add_xpath('identifier', product_id_xpath)
    loader.add_value('url', response.url)
    loader.add_css('name', 'div.product-name h1::text')
    loader.add_css('price', 'li.bigPrice span.price::text')
    loader.add_xpath('sku', product_id_xpath)
    loader.add_value('category', crumbs[1:])
    loader.add_css('image_url', 'img#image::attr(src)')
    yield loader.load_item()
def parse_product(self, response):
    """Yield variant form requests and the product item for this page.

    Nothing is yielded for "page-not-found" URLs or when the variant form
    has no inputs (a warning is logged). For every variant element a form
    request with that variant's value is yielded, handled by this same
    callback. If the page's sku differs from the ``skuId`` URL parameter,
    the page is re-requested with the right ``skuId`` and no item is
    produced. The item is yielded only when a price was loaded.
    """
    if response.url.endswith('page-not-found.page'):
        return

    formdata = {}
    for inp in response.xpath('//form[@id="variant-form"]//input'):
        formdata[inp.xpath('@name').extract_first()] = inp.xpath(
            '@value').extract_first()
    if not formdata:
        self.logger.warning('No data on %s' % response.url)
        return
    # Inputs without a name end up under the None key. Drop it if present;
    # a plain ``del`` raised KeyError when every input had a name.
    formdata.pop(None, None)

    options = response.css('.vContainer .variantDataElement')
    for option in options:
        formdata[option.xpath('@name').extract_first()] = option.xpath(
            '@data-variant-value').extract_first()
        yield FormRequest.from_response(
            response,
            formxpath='//form[@id="variant-form"]',
            formdata=formdata,
            callback=self.parse_product)

    loader = ProductLoader(item=Product(), response=response)
    sku = response.xpath('//input[@id="skuIdVal"]/@value').extract_first()
    if sku != url_query_parameter(response.url, 'skuId'):
        url = add_or_replace_parameter(url_query_cleaner(response.url),
                                       'skuId', sku)
        yield Request(url, self.parse_product)
        return

    loader.add_value('identifier', sku)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1[@id="productLabel"]//text()')
    loader.add_css('price', '.current-price ::text')
    loader.add_value('sku', sku)

    category = response.xpath(
        '//div[@id="breadcrumb"]//li//span[@itemprop="title"]/text()'
    ).extract()
    loader.add_value('category', category[-4:-1])

    image_url = response.xpath(
        '//img[@itemprop="image"]/@src').extract_first()
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url))

    loader.add_xpath(
        'brand', '//div[@itemprop="brand"]//span[@itemprop="name"]/text()')
    loader.add_value('shipping_cost', 3)
    # Availability is judged by the presence of the add-to-basket element.
    if not response.css('.add-to-basket'):
        loader.add_value('stock', 0)
    if loader.get_output_value('price'):
        yield loader.load_item()
def parse_frames(self, response):
    """Yield one item per (option column, size row) cell pair of a price table.

    Tables are located through their "Code"/"CODE" header cell. Each option
    column takes two cells per size row (sku, then price), starting at an
    offset that depends on which header style was found. Identifiers are
    built from the sku, the numbers in the size label and the page file
    name, and are suffixed with ``-d`` until unused in ``self.ids_seen``.
    """
    base_url = get_base_url(response)
    products = response.xpath('//tr/td[text()="Code"][1]')
    # ``margin`` is the column index of the first sku cell; it differs
    # between the two header styles.
    if products:
        margin = 3
    else:
        products = response.xpath('//tr/td[span/text()="CODE"][1]')
        if products:
            margin = 2
    if not products:
        self.log('No products found on %s' % response.url)
    identifiers = []
    image_url = response.xpath(
        '//img[not (contains(@alt, "Doors"))]/@src[contains(., "images-thumb")]'
    ).extract()
    for product in products:
        # Option names sit in the row just above the header row.
        for idx, option in enumerate(
                product.xpath(
                    './../preceding-sibling::tr[1]/td[position()>1]')):
            name = option.xpath('.//text()').extract()
            for size in product.xpath('./../following-sibling::tr'):
                # The next header row marks the end of this table.
                if size.xpath(
                        'td[(text()="Code") or (span/text()="CODE")]'):
                    break
                # Only rows whose first cell contains " x" are size rows.
                if not size.xpath('./td[1][contains(.//text(), " x")]'):
                    continue
                loader = ProductLoader(item=Product(), selector=size)
                loader.add_value('name', name)
                size_name = size.xpath('td[1]/text()').extract()
                loader.add_value('name', size_name)
                loader.add_xpath('sku',
                                 'td[%d]/text()' % (idx * 2 + margin))
                loader.add_xpath('price',
                                 'td[%d]/text()' % (idx * 2 + margin + 1))
                if not loader.get_output_value('sku'):
                    continue
                identifier = loader.get_output_value(
                    'sku') + '-' + '-'.join(re.findall(
                        '\d+', size_name[0]))
                # Append the page file name, cut at the first "_" and ".".
                identifier += '-' + response.url.split('/')[-1].split(
                    '_')[0].split('.')[0]
                # Make the identifier unique for this page and spider run.
                while identifier in identifiers or identifier in self.ids_seen:
                    identifier += '-d'
                identifiers.append(identifier)
                self.ids_seen.append(identifier)
                loader.add_value('identifier', identifier)
                loader.add_value('url', response.url)
                if image_url:
                    loader.add_value('image_url',
                                     urljoin(base_url, image_url[0]))
                yield loader.load_item()
def parse_product(self, response):
    """Scrape a product page and yield a single item.

    The identifier is the value of the ``product_id`` input. The SKU
    starts out equal to the identifier and is then replaced by the first
    matching candidate, in this order:

    1. the last word of the name, if it equals the shortest keyword
       (case-insensitively);
    2. the shortest keyword, if it contains no lower-case characters;
    3. the last word of the name, if it contains no lower-case characters;
    4. the longest digit-bearing token of the name.

    A candidate containing "(" is discarded in favour of the identifier.
    """
    loader = ProductLoader(Product(), response=response)
    identifier = response.xpath(
        '//input[@name="product_id"]/@value').extract_first()
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    loader.add_xpath('price', '//meta[@itemprop="price"]/@content')
    # The first breadcrumb link is skipped.
    category = response.xpath(
        '//div[@id="ProductBreadcrumb"]//a/text()').extract()[1:]
    loader.add_value('category', category)
    loader.add_xpath('image_url', '//img[@itemprop="image"]/@src')
    loader.add_xpath(
        'brand',
        '//div[@itemtype="http://schema.org/Organization"]'
        '/meta[@itemprop="name"]/@content')
    if not response.xpath(
            '//link[@itemprop="availability"]/@href[contains(., "InStock")]'):
        loader.add_value('stock', 0)

    name = loader.get_output_value('name')
    name_end = re.search(r'\S+$', name).group(0).strip(' ()')

    # A page without a keywords meta tag yields None; treat it as "no
    # keywords" instead of crashing on ``None.split``.
    raw_keywords = response.xpath(
        '//meta[@name="keywords"]/@content').extract_first() or ''
    # Strip before filtering so whitespace-only entries cannot become an
    # empty "shortest keyword" (which would produce an empty SKU).
    keywords = [word.strip() for word in raw_keywords.split(',')
                if word.strip()]
    shortest_keyword = min(keywords, key=len) if keywords else 'none'
    from_name = re.findall(r'\S*\d+\S*', name)

    sku = identifier
    if shortest_keyword.lower() == name_end.lower():
        sku = name_end
    elif shortest_keyword.upper() == shortest_keyword:
        sku = shortest_keyword
    elif name_end.upper() == name_end:
        sku = name_end
    elif from_name:
        sku = max(from_name, key=len)
    if '(' in sku:
        sku = identifier

    loader.replace_value('sku', sku)
    yield loader.load_item()
def parse_product(self, response):
    """Scrape a product page; yield the item only if its price is positive.

    The price is taken from the non-blank text nodes of the
    ``regular-price`` span, skipping the first one. The image is the
    first gallery image whose URL is shorter than 1024 characters.
    """
    hxs = HtmlXPathSelector(response)

    price_texts = hxs.select(
        "//span[@class='regular-price']//text()").extract()
    price = [text for text in price_texts if text.strip()][1:]

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('price', price)
    loader.add_value('url', response.url)
    loader.add_xpath('name', "//div[@class='product-name']//h1//text()")
    loader.add_xpath(
        'category',
        "//div[@class='breadcrumbs']//li[position() > 1 and position() < last()]/a/text()"
    )
    loader.add_value(
        'brand',
        hxs.select(
            "//div[@class='product-shop']/div[@class='product-name']/a[@class='brand']/text()"
        ).extract())
    loader.add_value('shipping_cost', 0)
    loader.add_xpath('sku', '//li/span[text()="SKU:"]/../text()')
    loader.add_xpath(
        'identifier',
        "//div[@class='product-view']//input[@name='product']/@value")

    gallery_urls = hxs.select(
        '//img[contains(@class, "gallery-image")]/@src').extract()
    # First gallery URL under the length limit, if any.
    chosen_image = next(
        (url for url in gallery_urls if len(url) < 1024), None)
    if chosen_image is not None:
        loader.add_value('image_url', chosen_image)

    product = loader.load_item()
    if product['price'] > 0:
        yield product
def parse_product(self, response):
    """Scrape a product page, yielding one item per variant if it has any.

    Identifiers come from ``response.meta['row']['PRODUCT_NUMBER']``.

    When the page lists SKU attributes, the ``stockMatrix`` JavaScript
    array is parsed. In each variant row, the entries before the one
    starting with "sku" are the option labels, the entry right after it
    is the availability text and the next one is the price. Each variant
    is yielded with the identifier ``<PRODUCT_NUMBER>-<row index>``.

    Otherwise a single item is yielded, using the ``speedtrapPrice``
    input and the ``product:availability`` meta tag.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_xpath(
        'category', '//li[@typeof="v:Breadcrumb"]/a[@href!="/"]/text()')
    brand = hxs.select(
        '//script[@type="text/javascript"]/text()').re('brand: *\"(.+)\"')
    loader.add_value('brand', brand)
    loader.add_xpath('image_url', '//div[@id="amp-originalImage"]/img/@src')
    loader.add_value('url', url_query_cleaner(response.url))
    loader.add_xpath(
        'name', '//input[@name="speedtrapProductDisplayName"]/@value')
    item = loader.load_item()

    if hxs.select('//ul[@class="productOptionsList"]'
                  '/li[contains(@class, "skuAttribute")]'):
        script = hxs.select(
            '//script[contains(text(),"stockMatrix =")]/text()')[0].extract()
        # Quote bare nulls so every matrix entry is a string and the
        # ``startswith`` test below works on all of them.
        script = script.replace('\n', '').replace('null', '"null"')
        match = re.search('stockMatrix = (.*?);', script, re.DOTALL)
        variants = json.loads(match.group(1)) if match else []
        for i, variant in enumerate(variants):
            sku = [elem for elem in variant if elem.startswith('sku')][0]
            sku_idx = variant.index(sku)
            product = Product(item)
            product['name'] = (
                item['name'] + ' - ' + ' '.join(variant[:sku_idx]).title())
            product['identifier'] = '{}-{}'.format(
                response.meta.get('row').get('PRODUCT_NUMBER'), i)
            product['sku'] = product['identifier']
            product['price'] = variant[sku_idx + 2]
            product['stock'] = (
                1 if 'Available#Delivery' in variant[sku_idx + 1] else 0)
            yield product
        return

    # Single product: identifier and SKU are both the product number.
    product_number = response.meta.get('row').get('PRODUCT_NUMBER')
    loader.add_value('identifier', product_number)
    loader.add_value('sku', product_number)
    loader.add_xpath('price', '//input[@name="speedtrapPrice"]/@value')
    in_stock = hxs.select(
        '//meta[@property="product:availability"]/@content[.="In Stock"]')
    loader.add_value('stock', 1 if in_stock else 0)
    yield loader.load_item()
def parse_product(self, response):
    """Scrape a product page and yield one item with its shipping cost.

    If the page has no image, the same URL is requested again with
    ``parse_category`` as the callback and nothing is scraped. Shipping
    is 2 for prices below 20, 4.99 for prices below 40, and left unset
    otherwise.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    prod_box = hxs.select('//div[@class="prod-box"]')
    breadcrumbs = hxs.select('//ul[@class="breadcrumbs"]')[0]

    loader = ProductLoader(selector=prod_box, item=Product())
    loader.add_value('url', response.url)

    # The brand is the breadcrumb entry right after the "Brands" link.
    loader.add_value(
        'brand',
        breadcrumbs.select(
            './/a[contains(text(), "Brands")]/../following-sibling::li[1]/a/text()'
        ).extract())

    crumb_labels = breadcrumbs.select('.//a/text()').extract()
    loader.add_value(
        'category',
        [label for label in crumb_labels if "Brand" not in label])

    image_sources = hxs.select('//section[@id="one"]//@src').extract()
    if not image_sources:
        yield Request(response.url,
                      callback=self.parse_category,
                      dont_filter=True)
        return
    loader.add_value('image_url', urljoin(base_url, image_sources[0]))

    loader.add_xpath('name', './h1/text()')
    loader.add_xpath('identifier', '//*/@prodref')
    loader.add_xpath('sku', '//*/@prodref')

    stock_labels = prod_box.select(
        '//*[text()="In Stock" or text()="Low Stock"]')
    if not stock_labels:
        loader.add_value('stock', 0)

    loader.add_xpath('price', './/span[@class="product-price"]/text()')
    product = loader.load_item()

    price = product['price']
    if price < 20:
        product['shipping_cost'] = 2
    elif price < 40:
        product['shipping_cost'] = 4.99
    yield product
def parse_product(self, response):
    """Yield one item per available option listed in ``stockMatrix``.

    Fields shared by all options (url, name, sku, category, image, brand)
    are loaded once. Each ``stockMatrix`` row is then consumed in order:
    one value per ``.skuAttribute`` element on the page (appended to the
    name), then the identifier, the stock text and the price. Rows whose
    stock text starts with "Unavailable" are skipped.
    """
    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('url', '//link[@rel="canonical"]/@href')
    loader.add_xpath('name', '//span[@id="productName"]//text()')
    loader.add_xpath('sku', '//span[@id="productEAN"]/text()[last()]')
    loader.add_xpath(
        'category',
        '//div[@id="breadcrumb"]/ul/li[position()>1]/a/span/text()')
    loader.add_css('image_url', '.productImageItem ::attr(href)')
    brand = response.css('.brand ::text').extract_first()
    if brand != "null":
        loader.add_value('brand', brand)
    item = loader.load_item()

    matrix_re = re.compile('stockMatrix = (.+?);', re.DOTALL)
    data = response.xpath('//script/text()').re(matrix_re)
    options = json.loads(data[0])

    # The attribute elements do not change between options; select once.
    sku_attributes = response.css('.skuAttribute')

    for option in options:
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value(None, item)

        # Built-in next() rather than the Python 2-only ``.next()`` method.
        opt_iter = iter(option)
        opt_name = ''
        for _ in sku_attributes:
            opt_name = next(opt_iter)
            loader.add_value('name', opt_name)

        # Look for a colour-specific image keyed by the last attribute
        # value consumed above.
        colour_url = response.xpath(
            '//input[@class="colourImageUrl"][@name="%s"]/@value'
            % opt_name).extract_first()
        if colour_url:
            loader.replace_value(
                'image_url',
                'http://media.littlewoods.com/i/littlewoods/%s?$1064x1416_standard$'
                % colour_url)

        loader.replace_value('identifier', next(opt_iter))
        stock = next(opt_iter)
        if stock.startswith('Unavailable'):
            continue
        loader.replace_value('stock', int('Out of stock' not in stock))
        loader.replace_value('price', next(opt_iter))
        yield loader.load_item()