def parse_product(self, response):
    """Parse a product page and yield one item per purchasable variant.

    Three page layouts are handled, tried in this order:

    1. a ``child-list`` table with one row per variant;
    2. a drop-down of options whose prices, SKUs and stock levels live in
       JSON maps embedded in the page script;
    3. a plain single-product page.

    Pages without a product name are logged and skipped.

    NOTE(review): the table layout charges shipping for ``price <= 49.99``
    while the other two layouts use ``price < 49.99``.  The difference is
    preserved here; confirm which threshold is intended.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    url = urljoin_rfc(base_url, response.url)
    image_url = hxs.select('//img[@id="product-image-main"]/@src').extract()

    product_name = hxs.select('//*[@id="product-header"]//h1/text()').extract()
    if not product_name:
        log.msg('Skips product without name: ' + response.url)
        return
    product_name = product_name[0].strip()

    # The last breadcrumb is used as the category; a page without
    # breadcrumbs gets an empty category instead of raising IndexError.
    crumbs = hxs.select('//div[@class="crumbs"]/span/a/span/text()').extract()
    category = crumbs[-1] if crumbs else ''
    brand = hxs.select('//*[@id="product-header"]/a/img/@alt').extract()
    brand = brand[0] if brand else ''

    options = hxs.select('//table[@class="child-list with-hover"][1]/tbody/tr')
    if options:
        for option in options:
            name = ''
            sku = ''
            get_name = 1
            in_stock = 1
            # Reset per row so a row without a price cell can never reuse
            # the previous row's price (or hit an unbound name).
            price = None
            for column in option.select('./td'):
                ctype = column.select('./@class').extract()[0]
                if ctype == 'code':
                    # Cells before the "code" cell describe the variant;
                    # from here on nothing more is appended to the name.
                    get_name = 0
                    name = product_name + name
                    sku = column.select('./text()').extract()[0]
                if get_name:
                    name += ' - ' + column.select('./text()').extract()[0]
                if ctype == 'price':
                    price = column.select('.//input/@value').extract()[-1]
                    price = extract_price(price)
                if ctype == 'status out-of-stock':
                    in_stock = 0
            if price is None:
                log.msg('Skips option without price: ' + response.url)
                continue

            identifier = sku
            loader = ProductLoader(item=Product(), selector=option)
            loader.add_value('identifier', identifier)
            loader.add_value('url', url)
            colour = hxs.select(
                '//li[.//td[text()="' + sku + '"]]'
                '/div[contains(@class, "colour")]/p/text()').extract()
            if colour:
                name = name + ' ' + colour[0]
            loader.add_value('name', name)
            if image_url:
                loader.add_value('image_url',
                                 urljoin_rfc(base_url, image_url[0]))
            loader.add_value('price', price)
            loader.add_value('sku', sku)
            loader.add_value('brand', brand)
            loader.add_value('category', category)
            if not in_stock:
                loader.add_value('stock', 0)
            if price <= 49.99:
                loader.add_value('shipping_cost', 3.95)
            else:
                loader.add_value('shipping_cost', 0)
            yield loader.load_item()
        return

    options = hxs.select(
        '//div[@class="product-options"]//option[not(@title="Not Selected")]')
    if options:
        def embedded_json(key):
            # Pulls the "<key>': {...},\n" object out of the page script.
            pattern = "%s': (\\{.+?}),\n" % key
            return json.loads(re.findall(pattern, response.body)[0])

        try:
            options_mappings = embedded_json('childMap')
            options_prices = embedded_json('prices')
            options_skus = embedded_json('skus')
            options_stocks = embedded_json('stockStatuses')
        except (IndexError, ValueError):
            # IndexError: map not present in the page; ValueError: bad JSON.
            log.msg('Skips product without option maps: ' + response.url)
            return

        for option in options:
            loader = ProductLoader(item=Product(), selector=hxs)
            option_name = (product_name + ' '
                           + option.select("./@title").extract()[0])
            option_id = option.select("./@value").extract()[0]
            option_mapping = str(options_mappings[option_id])
            option_price = extract_price(
                str(options_prices[option_mapping][0]['purchase']))
            option_sku = options_skus[option_mapping]
            option_stock = 0 if 'Out' in options_stocks[option_mapping] else 1
            loader.add_value('identifier', option_sku)
            loader.add_value('sku', option_sku)
            loader.add_value('url', url)
            loader.add_value('name', option_name)
            loader.add_value('price', option_price)
            loader.add_value('brand', brand)
            loader.add_value('category', category)
            loader.add_value('stock', option_stock)
            if image_url:
                loader.add_value('image_url',
                                 urljoin_rfc(base_url, image_url[0]))
            if option_price < 49.99:
                loader.add_value('shipping_cost', 3.95)
            else:
                loader.add_value('shipping_cost', 0)
            yield loader.load_item()
        return

    # Single product without variants.
    loader = ProductLoader(item=Product(), selector=hxs)
    sku = hxs.select('//div[@class="title"]//p/text()').extract()[0]
    sku = sku.replace('Product Code: P', '')
    identifier = sku
    loader.add_value('identifier', identifier)
    loader.add_value('sku', sku)
    loader.add_value('url', url)
    loader.add_value('name', product_name)
    if image_url:
        loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
    price = hxs.select('//*[@id="product-price"]//input/@value').extract()[0]
    price = extract_price(price)
    loader.add_value('price', price)
    loader.add_value('brand', brand)
    loader.add_value('category', category)
    in_stock = hxs.select('//*[@id="product-stock"]/text()').extract()[0]
    if in_stock != 'In stock':
        loader.add_value('stock', 0)
    if price < 49.99:
        loader.add_value('shipping_cost', 3.95)
    else:
        loader.add_value('shipping_cost', 0)
    yield loader.load_item()
def parse_products(self, response):
    """Parse a product listing page.

    Yields a follow-up request for every pagination link and one item for
    every listed product that has an identifier and a price above zero.
    The category is taken from ``response.meta['category']``.

    List entries that have no product link (the ``li`` XPath also matches
    non-product elements whose class merely contains "item") are skipped
    instead of aborting the whole page with an IndexError.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    category = response.meta.get('category', '')

    for url in hxs.select('//div[@class="pages"]//a/@href').extract():
        # Join with the base URL so relative pagination links work too;
        # absolute links are returned unchanged.
        yield Request(urljoin_rfc(base_url, url),
                      callback=self.parse_products,
                      meta=response.meta)

    for product in hxs.select('//li[contains(@class, "item")]'):
        links = product.select(
            './/h2[@class="product-name"]/a/@href').extract()
        if not links:
            continue
        url = links[0].strip()

        loader = ProductLoader(item=Product(), selector=product)

        models = product.select(
            './/p[contains(text(), "model: ")]/text()').re(r'model: (.*)')
        model = models[0].strip() if models else ''

        name = product.select(
            './/h2[@class="product-name"]/a/text()').extract()
        name = name[0].strip() if name else ''
        loader.add_value('name', ' '.join((name, model)))

        # Prefer the numeric id from the price element; fall back to the
        # id embedded in the compare/wishlist link.
        identifier = product.select(
            './/span[contains(@id, "product-price-")]/@id').re(
                r'product-price-(\d+)')
        if not identifier:
            identifier = product.select(
                './/ul[@class="add-to-links"]/li/a[@class="link-compare" or @class="link-wishlist"]/@href'
            ).re('product/(.*?)/')
        if identifier:
            loader.add_value('identifier', identifier[0])

        loader.add_value('url', url.split('?')[0])

        brands = product.select(
            './/p[contains(text(), "manufacturer: ")]/text()').re(
                r'manufacturer: (.*)')
        if brands:
            brand = brands[0].strip()
        else:
            brand = product.select('td[3]//text()').extract()
        loader.add_value('brand', brand)

        if model:
            loader.add_value('sku', model)

        image_url = product.select(
            './/a[@class="product-image"]/img/@src').extract()
        if image_url:
            loader.add_value('image_url',
                             urljoin_rfc(base_url, image_url[0]))

        # Two known price markups; default to '0.0' when neither matches
        # (such items are filtered out below).
        prices = product.select(
            './/span[contains(@id, "product-price-")]/span[@class="price"]/text()'
        ).extract()
        if not prices:
            prices = product.select(
                './/span[contains(@id, "product-price-") and contains(@class, "price")]/text()'
            ).extract()
        price = prices[0].strip() if prices else '0.0'
        loader.add_value('price', price)
        loader.add_value('category', category)

        collected = loader.get_collected_values('identifier')
        if collected and collected[0]:
            item = loader.load_item()
            if item['price'] > 0:
                yield item
        else:
            self.log('IDENTIFIER NOT FOUND!!! {}'.format(
                loader.get_output_value('url')))
def parse_product(self, response):
    """Parse a product page, expanding configurable options when present.

    A base item is built from the page.  If the page embeds a
    ``var spConfig = new Product.Config(...)`` JSON blob, one item is
    yielded per configured child product (identifier, name and price are
    derived from the base item plus the option labels/prices); otherwise
    the base item itself is yielded.

    Pages without a product name or product id are logged and skipped
    instead of raising.

    NOTE(review): ``shipping_cost`` is decided from the base price only and
    is copied unchanged onto every option item, even when the option price
    differs -- confirm that this is intended.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    name = response.css('.product-name').xpath('h1/text()').extract_first()
    if name is None:
        self.log('Skipping product without name: {}'.format(response.url))
        return
    identifier = hxs.select('//input[@name="product"]/@value').extract()
    if not identifier:
        self.log('Skipping product without identifier: {}'.format(
            response.url))
        return

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', name)
    loader.add_value('url', response.url)

    # The brand is the first known brand the (lower-cased) name starts with.
    sname = name.lower()
    for brand in self.brands:
        if sname.startswith(brand):
            loader.add_value('brand', brand.title())
            break

    # The first breadcrumb entry is dropped.
    categories = response.css('.breadcrumbs').xpath(
        './/a/span/text()').extract()[1:]
    loader.add_value('category', categories)

    sku = hxs.select(
        '//*[@id="product_addtocart_form"]//div[@class="expert-notes "]//span[contains(text(), "SKU: ")]/text()'
    ).extract()
    sku = sku[0].replace("SKU: ", '') if sku else ''
    loader.add_value('sku', sku)

    loader.add_value('identifier', identifier[0] + '-new')

    image_url = hxs.select('//img[@id="image-main"]/@src').extract()
    if image_url:
        loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))

    price = response.xpath('//script/text()').re('price":"(.+?)"')
    price = extract_price(price[0]) if price else 0
    loader.add_value('price', price)

    # "In stock" and "Back Order" both count as available.
    in_stock = hxs.select(
        '//div[@class="availability in-stock"]//div[@class="value" and contains(text(), "In stock")]'
    )
    if not in_stock:
        in_stock = hxs.select(
            '//p[@class="availability back-order"]//span[@class="value" and contains(text(), "Back Order")]'
        )
    if not in_stock:
        loader.add_value('stock', 0)

    if loader.get_output_value('price') < 100:
        loader.add_value('shipping_cost', 6.50)

    item = loader.load_item()

    options_config = re.search(
        r'var spConfig = new Product.Config\((.*)\)', response.body)
    if not options_config:
        yield item
        return

    product_data = json.loads(options_config.groups()[0])
    # Child product id -> accumulated " - label - label" suffix / surcharge.
    products = {}
    prices = {}
    for attr in product_data['attributes'].itervalues():
        for option in attr['options']:
            for product in option['products']:
                products[product] = ' - '.join(
                    (products.get(product, ''), option['label']))
                prices[product] = (prices.get(product, 0)
                                   + extract_price(option['price']))

    base_price = extract_price(product_data['basePrice'])
    for option_identifier, option_name in products.iteritems():
        option_item = deepcopy(item)
        option_item['identifier'] += '-' + option_identifier
        option_item['name'] += option_name
        option_item['price'] = base_price + prices[option_identifier]
        yield option_item
def parse_product(self, response):
    """Parse a product page and yield the product or its variants.

    Nothing is yielded when the page has no ``productid`` input.  Variant
    data is read line by line from the page body:

    * a ``variants[...] = [[price,qty,new Image...'sku'`` line starts a
      new variant;
    * following ``variants[... = value;`` lines add option ids to it;
    * ``names...[id] = "label";`` lines map option ids to labels.

    When variants exist, one item per variant is yielded (name suffixed
    with the option labels, identifier suffixed with the option ids);
    otherwise the base product is yielded.

    Raises:
        KeyError: if a variant references an option id that has no
            ``names`` entry.
    """
    hxs = HtmlXPathSelector(response)
    identifier = hxs.select(
        u'//form[@name="orderform"]/input[@name="productid"]/@value'
    ).extract()
    if not identifier:
        return

    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)
    product_loader.add_xpath('name', u'//h1[@class="dialog_title"]/text()')
    product_loader.add_xpath('sku', u'//span[@class="sku-code"]/text()')
    product_loader.add_value('identifier', identifier)
    product_loader.add_xpath('price', u'//span[@id="product_price"]/text()')
    product_loader.add_value('category', response.meta.get('category'))

    # Compiled once instead of once per body line.
    variant_head_re = re.compile(
        r"variants\[.*\] = \[\[([\d\.,]+),\d+,new Image.*'([^']+)'")
    variant_opt_re = re.compile(r'variants\[.* = (.+);')
    option_name_re = re.compile(r'names.*\[([^\]]+)\] = "(.+)";')

    options = []  # [price, sku, [option ids]] per variant
    names = {}    # option id -> label
    for line in response.body.split('\n'):
        m = variant_head_re.search(line)
        if m:
            price, sku = m.groups()
            options.append([price, sku, []])
            continue
        m = variant_opt_re.search(line)
        if m:
            # Ignore a stray continuation line that precedes any variant
            # header instead of failing on options[-1].
            if options:
                options[-1][2].append(m.group(1))
            continue
        m = option_name_re.search(line)
        if m:
            names[m.group(1)] = m.group(2)

    product_loader.add_xpath(
        'brand',
        u'normalize-space(//div[contains(@class, "order-info")]/div/a/@title)'
    )
    img = hxs.select('//img[@itemprop="image"]/@src').extract()
    if img:
        product_loader.add_value(
            'image_url', urljoin_rfc(get_base_url(response), img[0]))

    product = product_loader.load_item()
    if not options:
        yield product
        return

    for price, sku, ids in options:
        prod = Product(product)
        labels = ' '.join([names[opt_id] for opt_id in ids])
        prod['name'] = prod['name'] + ' (' + labels + ')'
        prod['sku'] = sku
        prod['identifier'] = prod['identifier'] + ':' + '.'.join(ids)
        prod['price'] = Decimal(price)
        yield prod
def parse_product(response):
    """Parse a product page and yield one item per option combination.

    Option groups are read from the ``perms['...`` arrays in the page
    body: every line between a line starting with ``perms['`` and the next
    line starting with ``];`` is collected and evaluated as a list of
    ``(price, name)`` pairs.  The base product is then combined with every
    combination returned by ``multiply(opt_groups)``.

    The image and the brand are optional: when the page has neither, the
    corresponding field is simply left unset.

    Raises:
        IndexError: if the page has no ``Q_...`` form input to take the
            identifier from.
    """
    hxs = HtmlXPathSelector(response)

    opt_groups = []
    inside = False
    lst = ''
    for line in response.body.split('\n'):
        if line.startswith('perms[\''):
            inside = True
            lst = ''
        elif line.startswith('];'):
            # Only close a group we actually opened; without the `inside`
            # check and the reset below, any later "];" line in the page
            # would append the previous group a second time.
            if inside and lst:
                # SECURITY(review): eval() runs text taken from the
                # downloaded page.  Left as is because the accepted input
                # grammar is not visible here; consider a literal-only
                # parser if the data allows it.
                opts = eval('[' + lst + ']')
                # XXX http://www.thesleepshop.co.uk/acatalog/4ft6_Double_Kyoto_Memphis_Futon.html#a11717
                # second option has "Deluxe Mattress" twice with different additional price
                # however price calculation ignores second addition price (uses first value)
                filtered_opts = []
                seen_names = set()
                for price, name in opts:
                    if name not in seen_names:
                        seen_names.add(name)
                        filtered_opts.append([price, name])
                opt_groups.append(filtered_opts)
            inside = False
            lst = ''
        elif inside:
            lst += line

    identifier = hxs.select(
        '//form//input[contains(@name, "Q_")]/@name').re(r'Q_(.*)$')[0]

    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)
    product_loader.add_xpath('name', u'//h3[@class="product"]/text()')
    product_loader.add_xpath('name', u'//span[@class="product"]/text()')
    product_loader.add_value('sku', identifier)
    product_loader.add_value('identifier', identifier)
    product_loader.add_value('category', response.meta.get('category'))

    # Try the known price locations in order until one yields a value.
    product_loader.add_xpath(
        'price',
        u'//div[@align="left"]/div/div[contains(text(),"Now")]/text()')
    if not product_loader.get_output_value('price'):
        product_loader.add_xpath(
            'price', u'//div[@id="price_inside"]//span//text()')
    if not product_loader.get_output_value('price'):
        product_loader.add_xpath(
            'price', u'//div[@id="price_inside"]//span/@ppraw')
    if not product_loader.get_output_value('price'):
        product_loader.add_value('price', '')

    img = hxs.select(
        u'//div[@class="slides_control"]/a/img/@src').extract()
    if not img:
        img = hxs.select(
            u'//div[@class="image_product"]//img/@src').extract()
    if img:
        product_loader.add_value(
            'image_url', urljoin_rfc(get_base_url(response), img[0]))

    brand_logo = hxs.select(
        u'//h3[@class="product"]/../img/@src').extract()
    if not brand_logo:
        brand_logo = hxs.select(
            u'//h3[@class="product"]/img/@src').extract()

    # Brand logo file name -> brand name.
    # NOTE(review): 'fainsborough' and 'healt beds' look like typos for
    # 'gainsborough' and 'health beds'.  They are kept because already
    # stored data may depend on the current spelling -- confirm and fix.
    brands = {
        '6thsense.jpg': '6th sense',
        'bentley.gif': 'bentley',
        'birlea.gif': 'birlea',
        'blank.gif': '',
        'brand': '',
        'Breasley.gif': 'breasley',
        'buoyant.jpg': 'buoyant',
        'cro.gif': 'cro',
        'cumfilux.gif': 'cumfilux',
        'dt.gif': 'dt',
        'dunlopillo.gif': 'dunlopillo',
        'durabeds.gif': 'durabeds',
        'easycomfort.gif': 'easy comfort',
        'friendship_mill.gif': 'friendship mill',
        'Furmanac.gif': 'furmanac',
        'gainsborough.gif': 'fainsborough',
        'gleneagle.gif': 'gleneagle',
        'harlequin.gif': 'harlequin',
        'harmony.gif': 'harmony',
        'healthbeds.gif': 'healt beds',
        'highgate.gif': 'highgate',
        'hypnos.gif': 'hypnos',
        'jay-be.gif': 'jay be',
        'julianbowenlogo.jpg': 'julian bowen',
        'kaymed.gif': 'kaymed',
        'komfi.gif': 'komfi',
        'kyoto.gif': 'kyoto',
        'limelight.gif': 'limelight',
        'metalbeds.gif': 'metalbeds',
        'millbrook.gif': 'millbrook',
        'myers.gif': 'myers',
        'nd.gif': 'newdesign',
        'nestledown.gif': 'nestledown',
        'obc.gif': 'original bedstead',
        'Protectabed.gif': 'protectabed',
        'rauch.gif': 'rauch',
        'relaxsan.gif': 'relaxsan',
        'relyon.gif': 'relyon',
        'rest_assured.gif': 'rest assured',
        'richman.gif': 'richman',
        'sealy.gif': 'sealy',
        'shakespeare.gif': 'shakespeare',
        'silentnight.gif': 'silentnight',
        'sleepeezee.gif': 'sleepeezee',
        'sleepshaper.gif': 'sleepshaper',
        'sleepyvalley.gif': 'sleepyvalley',
        'slumberland.gif': 'slumberland',
        'staples.gif': 'staples',
        'steens.gif': 'steens',
        'swanglen.gif': 'swanglen',
        'sweetdreams.gif': 'sweetdreams',
        'tss.gif': 'the sleep shop',
        'verona.jpg': 'verona',
        'welcome.gif': 'welcome furniture',
    }
    if brand_logo:
        product_loader.add_value(
            'brand',
            brands.get(brand_logo[0], remove_extension(brand_logo[0])))

    product = product_loader.load_item()
    for opt_price, opt_name in multiply(opt_groups):
        prod = Product(product)
        prod['name'] = (prod['name'] + ' ' + opt_name).strip()
        try:
            prod['price'] = (Decimal(prod['price'])
                             + Decimal(opt_price)).quantize(Decimal('1.00'))
        except TypeError:
            prod['price'] = Decimal(0)
        prod['identifier'] = prod['identifier'] + ':' + opt_name
        yield prod
class RDGToolsSpider(BaseSpider):
    """Spider for rdgtools.co.uk.

    ``parse`` walks category and sub-category pages, follows links to
    product pages and also scrapes products that are listed inline on a
    category page.  ``parse_product`` scrapes a dedicated product page.
    """

    name = 'rdgtools.co.uk'
    allowed_domains = ['rdgtools.co.uk', 'www.rdgtools.co.uk']
    start_urls = (u'http://www.rdgtools.co.uk/',)

    # Captures the numeric part of a pound-sterling price.
    _PRICE_RE = u'\xa3([\\d\\.,]+)'

    def _start_requests(self):
        # NOTE(review): the leading underscore means this is not the
        # `start_requests` name; it looks like a disabled single-page
        # debugging entry point -- confirm before removing.
        yield Request('http://www.rdgtools.co.uk/acatalog/Proxxon-Drilling---Grinding-Bits.html', callback=self.parse_product)

    def parse(self, response):
        """Parse a category page.

        Yields requests for categories, sub-categories and product pages,
        plus one item for every product listed inline on this page.
        Products are de-duplicated per page by id.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        parsed_ids = set()

        for cat in hxs.select('//*[@id="sidebar"]//li[@class="sections-list"]/a/@href').extract():
            yield Request(url=urljoin_rfc(base_url, cat))

        for subcat in hxs.select('//*[@id="ContentPage"]//span[@class="boxheading"]/a/@href').extract():
            yield Request(url=urljoin_rfc(base_url, subcat))

        product_link_xpaths = (
            '//div[@id="content"]//form//a[@class="read-more"]/@href',
            '//div[@id="content"]//form//h1/../../a/@href',
        )
        for xpath in product_link_xpaths:
            for url in hxs.select(xpath).extract():
                if not url:
                    continue
                # The id is the part after the last '-' of the file name.
                # Only the parsing is guarded: a bare `except` around the
                # `yield` would also swallow GeneratorExit.
                try:
                    pid = url.split('.')[-2].split('-')[-1]
                except IndexError:
                    continue
                if pid not in parsed_ids:
                    parsed_ids.add(pid)
                    yield Request(urljoin_rfc(base_url, url),
                                  callback=self.parse_product)

        category = hxs.select('//*[@id="ContentPage"]/p[@class="text_breadcrumbs"]/a/text()').extract()[1:]
        for product in hxs.select('//*[@id="ContentPage"]//div[@class="col-xs-12"]'):
            pid = product.select('.//input[contains(@name, "Q_")]/@name').re(r'Q_(.+)')
            if not pid:
                continue
            pid = pid[0]
            if pid in parsed_ids:
                continue
            parsed_ids.add(pid)

            name = ''.join(product.select('.//h1//text()').extract())
            url = response.url
            image_url = product.select('.//img[@class="catalog-image"]/@src').extract()
            image_url = urljoin_rfc(base_url, image_url[0]) if image_url else ''
            price = product.select('.//span[@class="catalog-price" or @class="product-price"]/text()').re(self._PRICE_RE)
            if price:
                price = price[0]
            else:
                # Same fallback as parse_product instead of an IndexError
                # that would abort the rest of the page.
                log.msg(">>> WARNING!!! NO PRICE >>> %s >>> %s" % (name, url))
                price = 0

            loader = ProductLoader(item=Product(), selector=product)
            loader.add_value('url', url)
            loader.add_value('name', name)
            loader.add_value('sku', pid)
            loader.add_value('identifier', pid)
            loader.add_value('category', category)
            loader.add_value('image_url', image_url)
            loader.add_value('price', price)
            yield loader.load_item()

    def parse_product(self, response):
        """Parse a product page and yield a single item.

        Pages without a ``Q_...`` input are logged and skipped.  A missing
        price is logged and stored as 0.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        category = hxs.select('//*[@id="idBreadcrumbsTop"]/p[@class="text_breadcrumbs"]/a/text()').extract()[1:]
        url = response.url

        skus = hxs.select('//input[contains(@name, "Q_")]/@name').re(r'Q_(.+)')
        if not skus:
            self.log('NO SKU %s' % url)
            return
        sku = skus[0]

        names = hxs.select(u'//div[@id="product-page-body"]//h1/text()').extract()
        name = ' '.join(names)

        image_url = hxs.select(u'//div[@id="product-page-body"]//img[@class="img-responsive catalog-image"]/@src').extract()
        image_url = urljoin_rfc(base_url, image_url[0]) if image_url else u''

        price = hxs.select('//div[@id="product-page-body"]//span[@class="catalog-price"]/text()').re(self._PRICE_RE)
        if not price:
            price = hxs.select('//div[@id="product-page-body"]//span[@class="product-price"]/text()').re(self._PRICE_RE)
        if price:
            price = price[0]
        else:
            log.msg(">>> WARNING!!! NO PRICE >>> %s >>> %s" % (name, url))
            price = 0

        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('url', url)
        loader.add_value('name', name)
        loader.add_value('sku', sku)
        loader.add_value('identifier', sku)
        loader.add_value('category', category)
        loader.add_value('image_url', image_url)
        loader.add_value('price', price)
        yield loader.load_item()
def parse_product(response):
    """Parse a product page and yield one item per size/colour variant.

    * More than one size option: one item per (size, colour) pair, with
      the identifier ``<id>_<size id>_<colour id>``.  A size label of the
      form ``... ( +$N)`` adds ``N`` to the price, and that suffix is
      removed from the item name.
    * Otherwise, with colours: one item per colour, with the identifier
      ``<id><colour id>`` (no separator).
    * Otherwise: a single item.

    NOTE(review): a page with sizes but no colours yields nothing, because
    the size/colour product is then empty -- confirm that is intended.

    Raises:
        IndexError: if the product id, SKU or price is missing.
    """
    hxs = HtmlXPathSelector(response)
    product_name = ''.join(
        hxs.select('//div[@class="web_pro_detail_title"]/h1/text()').
        extract()).strip()
    identifier = hxs.select(
        '//*[@id="product_addtocart_form"]//input[@name="product"]/@value'
    ).extract()[0]
    sku = hxs.select(
        '//span[@class="sku_block"]/text()').extract()[0].strip()
    img = hxs.select('//div[@class="pro_img"]//img/@src').extract()
    category = response.meta.get('category')

    price = hxs.select('//span[@id="product-price-{}"]/text()'.format(
        identifier)).extract()
    if not price:
        price = hxs.select(
            '//*[@id="product_addtocart_form"]//span[@class="price"]'
        ).extract()
    price = extract_price(price[0])

    def build_item(item_identifier, item_name, item_price):
        # Fields that are identical for every variant come from the
        # enclosing scope; only identifier, name and price vary.
        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('identifier', item_identifier)
        loader.add_value('sku', sku)
        loader.add_value('url', response.url)
        loader.add_value('name', item_name)
        loader.add_value('price', item_price)
        if img:
            loader.add_value(
                'image_url', urljoin_rfc(get_base_url(response), img[0]))
        loader.add_value('category', category)
        return loader.load_item()

    color_variations = [
        [color.select('./@valueid').extract()[0],
         color.select('./@val').extract()[0]]
        for color in hxs.select('//div[@id="colour_options_hidden"]//img')
    ]

    sizes = hxs.select(
        '//select[@class=" product-custom-option sizecc"]/option')
    if len(sizes) > 1:
        # The first <option> is skipped.
        size_variations = [
            [size.select('./@value').extract()[0],
             size.select('./text()').extract()[0]]
            for size in sizes[1:]
        ]
        for size, color in itertools.product(size_variations,
                                             color_variations):
            size_id, size_name = size
            color_id, color_name = color
            result = re.findall(r"(?sim)\( \+\$([\d.]+)\)", size_name)
            if result:
                add_price = extract_price(result[0])
                # The placeholder has to be filled in before replacing;
                # the unformatted '( +${})' never occurs in a label, so
                # the surcharge text used to stay in the product name.
                size_name = size_name.replace(
                    '( +${})'.format(result[0]), '').strip()
            else:
                add_price = extract_price('0')
            yield build_item(
                identifier + '_' + size_id + '_' + color_id,
                product_name + ' ' + size_name + ' ' + color_name,
                price + add_price)
    elif color_variations:
        for color_id, color_name in color_variations:
            yield build_item(identifier + color_id,
                             product_name + ' ' + color_name,
                             price)
    else:
        yield build_item(identifier, product_name, price)
def parse_product(self, response):
    """Parse a product page and yield one item per option combination.

    Every ``ctaselector`` block contributes one group of options.  One
    item is yielded per combination across all groups, with the option
    names appended to the product name and the option ids appended to the
    identifier.  A page without options yields a single item.

    If the page script has no ``product_id`` the page is requested again.
    Pages without a product name or price are logged and skipped.

    NOTE(review): the re-request has no retry limit and no explicit
    callback (so it is handled by the spider's default callback) --
    confirm this cannot loop forever.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    image_url = hxs.select(
        '//span[@class="mainimage"]//img/@src').extract()
    image_url = urljoin_rfc(base_url, image_url[0]) if image_url else ''

    product_identifier = hxs.select('//script/text()').re(
        'var product_id *= *(.+);')
    if not product_identifier:
        yield Request(response.url, dont_filter=True)
        return
    product_identifier = product_identifier[0]

    product_name = hxs.select('//h1[@itemprop="name"]/text()').extract()
    if not product_name:
        self.log('Skipping product without name: {}'.format(response.url))
        return
    product_name = product_name[0].strip()

    category = hxs.select(
        '//div[@id="breadcrumbs"]//a/text()').extract()[1:]
    brand = re.findall("'brand': '(.*)',", response.body)
    brand = brand[0].strip() if brand else ''

    product_price = hxs.select('//script/text()').re("'price' *: *'(.+?)'")
    if not product_price:
        self.log('Skipping product without price: {}'.format(response.url))
        return
    product_price = extract_price(product_price[0])

    sku = hxs.select('//span[@class="mpn"]//text()').re(
        'Product code: *(.+)')

    options = []
    for select in hxs.select('//div[@class="ctaselector"]'):
        values = select.select('.//li/a/@id').extract()
        titles = select.select('.//li/a/span/text()').extract()
        opts = [{'identifier': value, 'name': title}
                for value, title in zip(values, titles)]
        if opts:
            options.append(opts)

    # With no option groups itertools.product() yields exactly one empty
    # combination, so the plain product needs no separate branch.
    for opts in itertools.product(*options):
        name = product_name
        identifier = product_identifier
        for option in opts:
            name += ' ' + option['name']
            identifier += '_' + option['identifier']

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('identifier', identifier)
        product_loader.add_value('name', name)
        if image_url:
            product_loader.add_value('image_url', image_url)
        product_loader.add_value('price', product_price)
        if product_loader.get_output_value('price') < 50:
            product_loader.add_value('shipping_cost', 3.95)
        product_loader.add_value('url', response.url)
        product_loader.add_value('brand', brand)
        product_loader.add_value('sku', sku)
        product_loader.add_value('category', category)
        yield product_loader.load_item()
def parse_product(self, response):
    """Parse a product page and yield one item per selectable option.

    With options present, the first ``<option>`` is skipped and one item
    is yielded for each remaining one; an option carrying a ``disabled``
    attribute is reported with stock 0.  Without options a single item is
    yielded, with stock 0 when the page's ``errorMsg`` element says
    ``OUT OF STOCK``.

    Pages without a price meta tag are logged and skipped.

    Raises:
        IndexError: if the product id, title or SKU is missing.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    image_url = hxs.select('//*[@id="zoomImage"]/@src').extract()
    product_identifier = hxs.select(
        '//input[@name="shopListrec"]/@value').extract()[0].strip()
    product_name = hxs.select(
        '//h1[contains(@class, "prodTitle")]/text()').extract()[0]
    category = hxs.select(
        '//p[@class="crumbsDN"]//a/span/text()').extract()
    brand = response.meta.get('brand')

    raw_price = response.xpath(
        '//meta[@itemprop="price"]/@content').extract_first()
    if raw_price is None:
        # Decimal(None) would raise TypeError.
        self.log('Skipping product without price: {}'.format(response.url))
        return
    price = Decimal(raw_price)

    sku = hxs.select('//span[@itemprop="productID"]/text()').extract()[0]
    page_messages = hxs.select('//*[@id="errorMsg"]/text()').extract()
    options = hxs.select('//*[@id="options"]//select/option')

    def build_item(identifier, name, sold_out):
        # Shared fields come from the enclosing scope.
        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('identifier', identifier)
        product_loader.add_value('name', name)
        if image_url:
            product_loader.add_value(
                'image_url', urljoin_rfc(base_url, image_url[0]))
        product_loader.add_value('price', price)
        product_loader.add_value('url', response.url)
        product_loader.add_value('brand', brand)
        product_loader.add_value('category', category)
        product_loader.add_value('sku', sku)
        if price < 75:
            product_loader.add_value('shipping_cost', 3.99)
        if sold_out:
            product_loader.add_value('stock', 0)
        return product_loader.load_item()

    if len(options) > 1:
        self.log('More options!! {}'.format(response.url))

    if options:
        for option in options[1:]:
            identifier = option.select('./@value').extract()[0]
            option_name = option.select('./text()').extract()[0].split(
                '-')[0].strip()
            disabled = option.select('./@disabled').extract()
            yield build_item(product_identifier + '_' + identifier,
                             product_name + ' ' + option_name,
                             bool(disabled))
    else:
        # `page_messages` is a list of text nodes; comparing the list
        # itself with a string was always False, so out-of-stock products
        # were never marked.
        sold_out = any(message.strip() == 'OUT OF STOCK'
                       for message in page_messages)
        yield build_item(product_identifier, product_name, sold_out)
def parse(self, response):
    """Yield a ``parse_products`` request for every link in the
    ``modelLevelHub`` list of the page."""
    site_root = get_base_url(response)
    selector = HtmlXPathSelector(response)
    hub_links = selector.select(
        '//*[@id="modelLevelHub"]/li/a/@href').extract()
    for href in hub_links:
        target = urljoin_rfc(site_root, href)
        yield Request(target, callback=self.parse_products)
def parse_product(self, response):
    """Parse a product page and yield the product or a shipping request.

    First, a request back to this callback is yielded for every entry of
    the variant drop-down.  The product is then built together with a
    ``SonaeMeta`` record holding stock text, promotion price and the
    promotion start/end dates (carried over from ``self.meta_df`` when the
    product was seen before).

    If the page exposes a shipping product id, a request to the shipping
    endpoint is yielded with the product in ``meta`` (``parse_shipping``
    is expected to emit it); otherwise the product itself is yielded.

    Pages without a product id return quietly; pages with an id but no
    name are logged and skipped.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    for url in hxs.select(
            '//div[@class="stretch clearfix box"]/select/option/@value'
    ).extract():
        yield Request(urljoin_rfc(base_url, url),
                      callback=self.parse_product,
                      meta={'dont_redirect': True})

    # Check the identifier before touching the name: a page that is not a
    # product page should return here instead of raising on the name.
    identifier = hxs.select(
        '//div[@class="pd-container-right"]//form[@class="addBasketItem"]//input[@name="productId"]/@value'
    ).extract()
    if not identifier:
        return
    names = hxs.select(
        '//*[@id="centerC"]/h1/span[@itemprop="name"]/text()').extract()
    if not names:
        self.log('Skipping product without name: {}'.format(response.url))
        return

    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_value('name', names[0])
    loader.add_value('identifier', identifier[0])
    loader.add_value('url', response.url)

    price = hxs.select('//noscript/span/text()').extract()
    price = extract_price(price[0]) if price else '0'
    loader.add_value('price', price)

    stock = hxs.select('//*[@id="first3"]/p/span/text()').extract()
    stock = stock[0] if stock else ''

    # The first breadcrumb entry is dropped.
    categories = hxs.select(
        '//*[@id="infoblock"]/div/a/text()').extract()[1:]
    for category in categories:
        loader.add_value('category', category)

    brand = hxs.select('//div[@class="pd-brand box"]/a/img/@alt').extract()
    brand = brand[0] if brand else ''
    loader.add_value('brand', brand)

    image_url = hxs.select('//*[@id="showPic"]/@src').extract()
    image_url = urljoin_rfc(base_url, image_url[0]) if image_url else ''
    loader.add_value('image_url', image_url)

    product = loader.load_item()

    promotion_price = hxs.select(
        u'//p[contains(text(), "Preço Regular")]/strike/text()').re(
            r'[\d,.]+')

    metadata = SonaeMeta()
    metadata['exclusive_online'] = 'No'
    if promotion_price:
        # "1.234,56" -> "1234.56"
        metadata['promotion_price'] = promotion_price[0].replace(
            '.', '').replace(',', '.')
    metadata['stock'] = stock

    if (self.meta_df is not None and not self.meta_df.empty
            and identifier[0] in self.meta_df.index):
        prev_meta = self.meta_df.loc[identifier[0]]
    else:
        prev_meta = {}
    promo_start = prev_meta.get('promo_start')
    promo_end = prev_meta.get('promo_end')

    # Read the clock once so the date and the timestamp cannot disagree
    # around midnight.
    now = datetime.datetime.now()
    today = now.strftime('%Y-%m-%d')
    metadata['extraction_timestamp'] = now.strftime('%Y-%m-%d %H:%M')

    if promotion_price:
        # An ongoing promotion keeps its original start date; a new one
        # (or one that had already ended) starts today.
        metadata['promo_start'] = (
            promo_start if promo_start and not promo_end else today)
        metadata['promo_end'] = ''
    elif promo_start:
        # The promotion is over: keep the start, close it today unless an
        # end date is already recorded.
        metadata['promo_start'] = promo_start
        metadata['promo_end'] = today if not promo_end else promo_end

    product['metadata'] = metadata

    shipping_pid = hxs.select(
        '//span[@id="shipmentDetails"]/@data-productid').extract()
    if shipping_pid:
        shipping_url = ('https://www.redcoon.pt/req/ajax/mod/ShopShipment/pid/'
                        + shipping_pid[0])
        headers = {
            'X-Requested-With': 'XMLHttpRequest',
        }
        yield Request(shipping_url,
                      headers=headers,
                      callback=self.parse_shipping,
                      meta={'product': product})
    else:
        yield product
def parse_item(self, response):
    """Yield a ``parse_product`` request for every product link found in
    the response body.

    The links are matched in their backslash-escaped form (as they appear
    in the body) and resolved against http://www.cyberport.de/.
    """
    link_pattern = r'<a class=\\"hint\\" title=\\"Weitere Informationen zum Produkt\\" href=\\"\\/(.*?)\\"'
    relative_paths = (found.group(1)
                      for found in re.finditer(link_pattern, response.body))
    for path in relative_paths:
        product_url = urljoin_rfc('http://www.cyberport.de/', path)
        yield Request(product_url, callback=self.parse_product)
def parse_product_list(self, response):
    """Parse a listing page and yield one product-page request per tile.

    For each product tile a partial item (name, url, identifier, sku,
    price, category, brand and, when present, image) is built and passed
    to ``parse_product`` through ``meta['product']``.  Every request gets
    its own cookie jar, numbered by ``self.jar_counter``.

    Tiles are skipped when they have no name, no link, no price, no
    numeric id in the link, or an id already recorded in ``self.ids``.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    for product in hxs.select(
            '//div[contains(@class, "product producttile")]'):
        product_name = product.select(
            './/div[@class="name"]/a/text()').extract()
        links = product.select('.//div[@class="name"]/a/@href').extract()
        prices = product.select(
            './/div[@class="salesprice"]/text()').extract()
        if not product_name or not links:
            continue
        product_name = product_name[0].strip()
        if not prices:
            # One tile without a price must not abort the whole page.
            self.log('Skipping product without price: {}'.format(
                product_name))
            continue

        # Absolute links are returned unchanged; relative ones would
        # otherwise be rejected by Request.
        url = urljoin_rfc(base_url, links[0])
        match = re.search(r"(\d+)\.html", url)
        if not match:
            continue
        identifier = match.group(1)
        if identifier in self.ids:
            continue
        self.ids.append(identifier)

        product_loader = ProductLoader(item=Product(), selector=product)
        image_url = product.select('.//img[@id="firimg"]/@src').extract()
        if image_url:
            product_loader.add_value('image_url',
                                     urljoin_rfc(base_url, image_url[0]))
        product_loader.add_value('name', product_name)
        product_loader.add_value('url', url)
        product_loader.add_value('identifier', identifier)
        product_loader.add_value('sku', identifier)
        product_loader.add_value('price', extract_price(prices[0]))

        category = response.meta.get('category', '')
        if not category or response.meta.get('full'):
            # Fall back to (or, for "full" crawls, prefer) the category
            # printed on the tile, using the part after the bullet when
            # there is one.
            category2 = product.select(
                './/div[@class="capacityType"]/text()').extract()
            if category2:
                parts = category2[0].split(u'\u2022')
                category2 = parts[1] if len(parts) > 1 else parts[0]
                if category2.strip():
                    category = category2
        if category:
            # Diageo have requested that we group all categories with 'Whisky' or 'Whiskey' in the name into one category named 'Whisky'
            # https://www.assembla.com/spaces/competitormonitor/tickets/2254
            lowered = category.lower()
            if 'whisky' in lowered or 'whiskey' in lowered:
                category = 'Whisky'
            product_loader.add_value('category', category.strip())

        # Normalise once instead of once per brand.
        normalized_name = product_name.replace(
            'A&J', 'Alexander & James').replace(u'C\xeeroc', 'Ciroc')
        for brand in self.brands:
            if brand in normalized_name:
                product_loader.add_value('brand', brand)
                break

        item = product_loader.load_item()
        self.jar_counter += 1
        yield Request(url,
                      callback=self.parse_product,
                      meta={
                          'product': item,
                          'cookiejar': self.jar_counter
                      },
                      cookies={})
def parse_product(self, response):
    """Build items from a product page, one per configurable option if any.

    When the page embeds a ``spConfig`` JSON blob, every simple product listed
    under its attribute options becomes its own item.  That item carries no
    price; it is sent in ``meta['product']`` with a ``FormRequest`` to the
    servings-info endpoint, handled by ``self.parse_price``.  Without the blob
    a single item is yielded directly, priced from the page.

    :param response: product page response; ``meta['brand']`` is read.
    :returns: generator of ``FormRequest`` objects or of one ``Product``.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    image_url = hxs.select('//*[@id="image"]/@src').extract()

    try:
        product_identifier = hxs.select(
            '//input[@name="product"]/@value').extract()[0].strip()
    except IndexError:
        # No hidden "product" input: take the id from the add-to-cart form
        # action instead.  Only the missing-element case is handled here, so
        # unrelated errors are no longer silently swallowed.
        product_identifier = hxs.select(
            '//form[@id="product_addtocart_form"]/@action').re(
                r'/product/(\d+)')[0]

    product_name = hxs.select('//*[@id="productname"]/text()').extract()[0]
    category = hxs.select(
        '//div[@class="breadcrumbs"]//a/text()').extract()[1:]
    sku = product_identifier

    options_config = re.search(
        r'var spConfig = new Product.Config\((.*)\)', response.body)
    if options_config:
        product_data = json.loads(options_config.group(1))

        # simple-product id -> accumulated option labels (leading space kept)
        option_names = {}
        # simple-product id -> list of {'attr_id', 'val'} selections
        selections = {}
        for attr_id, attr in product_data['attributes'].iteritems():
            for option in attr['options']:
                for simple_id in option['products']:
                    option_names[simple_id] = ' '.join(
                        (option_names.get(simple_id, ''), option['label']))
                    selections.setdefault(simple_id, []).append({
                        'attr_id': attr_id,
                        'val': option['id']
                    })

        for identifier, option_name in option_names.iteritems():
            product_loader = ProductLoader(item=Product(), selector=hxs)
            product_loader.add_value('identifier',
                                     product_identifier + '_' + identifier)
            product_loader.add_value('name', product_name + option_name)
            if image_url:
                product_loader.add_value(
                    'image_url', urljoin_rfc(base_url, image_url[0]))
            product_loader.add_value('url', response.url)
            product_loader.add_value('category', category)
            product_loader.add_value('sku', sku)
            product_loader.add_value('brand', response.meta.get('brand'))
            product = product_loader.load_item()

            form_data = {'product': product_identifier, 'billing_qty': '1'}
            for selection in selections[identifier]:
                form_data['super_attribute[{}]'.format(
                    selection['attr_id'])] = str(selection['val'])

            yield FormRequest(
                url='http://www.musclefood.com/billing/ajax/servingsinfo/',
                formdata=form_data,
                meta={'product': product},
                callback=self.parse_price,
                dont_filter=True)
    else:
        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('identifier', product_identifier)
        product_loader.add_value('name', product_name)
        if image_url:
            product_loader.add_value('image_url',
                                     urljoin_rfc(base_url, image_url[0]))
        price = ''.join(
            hxs.select('//span[@class="price"]/text()').extract()).strip()
        price = extract_price(price)
        product_loader.add_value('price', price)
        product_loader.add_value('url', response.url)
        product_loader.add_value('category', category)
        product_loader.add_value('sku', sku)
        product_loader.add_value('brand', response.meta.get('brand'))
        # A shipping cost is recorded only for prices below 75.
        if price < 75:
            product_loader.add_value('shipping_cost', 3.95)
        yield product_loader.load_item()
def parse_product(response):
    """Yield one item per size option, or a single item for a plain product.

    Pages that embed a ``var productData = ...`` script are treated as
    multi-option: the ``<option>`` elements of ``#product_size_full`` (the
    first one skipped) are paired in order with ``productData['items']``.
    Other pages yield one item, or nothing when name or product id is missing.

    The ``qbDiscount`` query parameter of the URL, when present, is applied
    as a whole-number percentage off every price.

    :param response: product page response.
    :returns: generator of ``Product`` items.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    discount = url_query_parameter(response.url, 'qbDiscount')

    def discounted(price):
        # Build the rate from the integer so no binary float error enters the
        # Decimal arithmetic (Decimal(0.15) is not exactly 0.15).
        if not discount:
            return price
        rate = decimal.Decimal(int(discount)) / 100
        return round(price - rate * price, 2)

    match = re.search("var productData = (.*?)</script>", response.body,
                      re.DOTALL | re.IGNORECASE)
    if match:
        prod_name = hxs.select(
            '//*[@id="wrapper_page_content"]//h1/text()').extract()[0]
        sku = hxs.select(
            '//*[@id="product_tab_1"]//li[@class="product_code"]/span/text()'
        ).extract()[0]
        image_url = hxs.select(
            '//*[@id="product_view_full"]/@href').extract()
        category = hxs.select(
            '//*[@id="nav_breadcrumb"]//a/span/text()').extract()[1:]
        product_identifier = hxs.select(
            '//*[@id="productId"]/@value').extract()[0]

        options_prices = demjson.decode(match.group(1))['items']
        # The first <option> is skipped before pairing with the price list.
        options = hxs.select('//*[@id="product_size_full"]/option')[1:]

        for option, price_info in zip(options, options_prices):
            product_loader = ProductLoader(item=Product(), selector=hxs)
            name = option.select('./text()').extract()[0]
            identifier = option.select('./@value').extract()[0]
            product_loader.add_value('identifier',
                                     product_identifier + '_' + identifier)
            product_loader.add_value('name', prod_name + ' ' + name)
            product_loader.add_value('sku', sku)
            if image_url:
                product_loader.add_value(
                    'image_url', urljoin_rfc(base_url, image_url[0]))
            price = discounted(extract_price(str(price_info['nowprice'])))
            product_loader.add_value('price', price)
            product_loader.add_value(
                'category',
                transform_category(product_loader.get_output_value('name'),
                                   category))
            product_loader.add_value('url', response.url)
            # A disabled <option> is recorded as stock 0.
            if option.select('./@disabled').extract():
                product_loader.add_value('stock', 0)
            yield product_loader.load_item()
    else:
        product_loader = ProductLoader(item=Product(), selector=hxs)
        name = hxs.select(
            '//*[@id="wrapper_page_content"]//h1/text()').extract()
        if not name:
            return
        identifier = hxs.select('//*[@id="productId"]/@value').extract()
        if not identifier:
            return
        product_loader.add_value('identifier', identifier[0])
        product_loader.add_value('name', name[0])
        sku = hxs.select(
            '//*[@id="product_tab_1"]//li[@class="product_code"]/span/text()'
        ).extract()[0]
        product_loader.add_value('sku', sku)
        image_url = hxs.select(
            '//*[@id="product_view_full"]/@href').extract()
        if image_url:
            product_loader.add_value('image_url',
                                     urljoin_rfc(base_url, image_url[0]))
        price = hxs.select(
            '//ul[@class="product_summary"]/li[@class="product_price"]/span/text()'
        ).extract()[0]
        product_loader.add_value('price', discounted(extract_price(price)))
        category = hxs.select(
            '//*[@id="nav_breadcrumb"]//a/span/text()').extract()[1:]
        product_loader.add_value(
            'category',
            transform_category(product_loader.get_output_value('name'),
                               category))
        product_loader.add_value('url', response.url)
        yield product_loader.load_item()
def parse(self, response):
    """Request every product linked from the listing's image figures.

    Each ``href`` found under ``figure.produit-image`` is resolved against the
    page base URL and handed to ``self.parse_product``.
    """
    selector = HtmlXPathSelector(response)
    page_base = get_base_url(response)
    product_links = selector.select(
        '//figure[@class="produit-image"]/a/@href').extract()
    for href in product_links:
        absolute_url = urljoin_rfc(page_base, href)
        yield Request(absolute_url, callback=self.parse_product)
def parse_product(self, response):
    """Yield one item per SKU ``<option>`` of the product page.

    Options are read from ``#dropdownSkuCtrl_nojs`` and matched by their
    ``value`` attribute against the ``arrSku`` JavaScript objects (keyed by
    ``skuID``), which supply colour and size.  Options with no ``skucode`` or
    with no JavaScript counterpart are skipped.  Each item gets a ``CRCMeta``
    whose ``rrp`` is the retail price when it exceeds the selling price, and
    an empty string otherwise.

    When no ``prodid`` attribute is found on the page, ``self.retry`` is
    called and nothing is yielded.

    :param response: product page response.
    :returns: generator of ``Product`` items.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    product_identifier = hxs.select('//@prodid').extract()
    if not product_identifier:
        # NOTE(review): the return value of self.retry is discarded here -
        # presumably it schedules the request itself; confirm.
        self.retry(response, "Cant find identifier on " + response.url)
        return
    product_identifier = product_identifier[0].strip()

    default_image = hxs.select(
        '//div[@class="prod-images-container"]//img/@src').extract()
    product_name = hxs.select(
        '//div[@class="product-name"]//h1/text()').extract()[0].strip()
    category = hxs.select('//*[@id="bcrumb"]/a[2]/text()').extract()
    category = category[0].strip() if category else ''
    brand = hxs.select(
        '//div[contains(@class, "product-brand-icon")]/a/img/@alt'
    ).extract()
    brand = brand[0].strip() if brand else ''

    # extract_first() returns None when the header is absent; treat that as
    # "neither colour nor size" instead of raising TypeError on `in`.
    option_title = response.css('.prod-selector-section-hdr').xpath(
        'text()').extract_first() or ''
    size_or_color = 'Color' in option_title or 'Size' in option_title

    options = response.xpath('//div[@id="dropdownSkuCtrl_nojs"]//option')

    # Decode each JS object once and index it by SKU id.
    js_options = {}
    for raw in response.xpath(
            '//script[contains(., "arrSku")]/text()').re('({.+?});'):
        decoded = demjson.decode(raw)
        js_options[decoded['skuID']] = decoded

    for option in options:
        product_loader = ProductLoader(item=Product(), selector=option)

        sku = option.select('./@skucode').extract()
        if not sku:
            continue
        sku = sku[0]
        product_loader.add_value('sku', sku)

        identifier = option.select('./@value').extract()[0]
        if identifier not in js_options:
            continue
        product_loader.add_value('identifier',
                                 product_identifier + '_' + identifier)

        details = js_options[identifier]
        option_name = ', '.join(
            part for part in (details['color'].title(),
                              details['size'].title()) if part)
        if not option_name:
            # A colour/size selector with neither value is skipped; other
            # selectors fall back to the option's visible text.
            if size_or_color:
                continue
            option_name = option.select('./text()').extract()[0].strip()
        product_loader.add_value('name', product_name + ', ' + option_name)

        # Prefer the colour-specific image, but never let one option's lookup
        # overwrite the page-level default used by the following options.
        option_image = default_image
        attcolor = option.select('./@attcolor').extract()
        if attcolor:
            colour_image = hxs.select('//div[@attcolor="{}"]/@lrgimg'.format(
                attcolor[0])).extract()
            if colour_image:
                option_image = colour_image
        if option_image:
            product_loader.add_value('image_url',
                                     urljoin_rfc(base_url, option_image[0]))

        price = extract_price(option.select('./@adjdefprice').extract()[0])
        product_loader.add_value('price', price)
        product_loader.add_value('url', response.url)
        product_loader.add_value('brand', brand)
        product_loader.add_value('category', category)

        skustatus = option.select('./@skustatus').extract()[0]
        if skustatus == 'INVUVL':
            product_loader.add_value('stock', 0)

        product = product_loader.load_item()

        rrp = extract_price(option.select('./@retailprice').extract()[0])
        metadata = CRCMeta()
        metadata['rrp'] = str(rrp) if rrp > price else ''
        product['metadata'] = metadata
        yield product
def parse_product(self, response):
    """Yield one item per product described in the ``#item_Tbl`` table.

    The table rows are walked in order; the text of the first cell acts as a
    label and the second cell holds the value.  A ``Name:`` row starts a new
    product (emitting the previous one, if any); ``Price:``,
    ``Product Code:`` and ``Availability:`` rows fill in the current loader;
    any other non-empty label except ``Qty:`` has its value appended to the
    product name.  The last product is emitted after the loop.

    Category and brand come from ``response.meta`` (``category_name`` and
    ``brand``).  Every item gets a ``CRCMeta`` carrying ``rrp``.

    NOTE(review): ``product_loader`` is only bound once a ``Name:`` row has
    been seen, so a table whose first labelled row is something else would
    raise - confirm the page always starts with ``Name:``.
    NOTE(review): ``rrp`` is only reassigned on ``Price:`` rows, so a product
    without such a row keeps the previous product's value - confirm intended.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    image_url = hxs.select('//img[@style="cursor : hand;"]/@src').extract()
    category = response.meta.get('category_name')
    brand = response.meta.get('brand', '')
    rrp = ''
    product_rows = hxs.select('//*[@id="item_Tbl"]//tr')
    # True until the first 'Name:' row: there is nothing to flush before it.
    first_row = True
    product_name = None
    for row in product_rows:
        # Label of the row, taken from all text in its first cell.
        name = ''.join(row.select('./td[1]//text()').extract()).strip()
        if name == 'Name:':
            if not first_row:
                # A new product begins: finish and emit the previous one.
                product_loader.add_value('name', product_name)
                product_loader.add_value('url', response.url)
                product_loader.add_value('category', category)
                product_loader.add_value('brand', brand)
                if image_url:
                    product_loader.add_value(
                        'image_url', urljoin_rfc(base_url, image_url[0]))
                product = product_loader.load_item()
                metadata = CRCMeta()
                metadata['rrp'] = rrp
                product['metadata'] = metadata
                yield product
            first_row = False
            product_loader = ProductLoader(item=Product(), selector=hxs)
            product_name = ''.join(
                row.select('./td[2]//text()').extract()).strip()
        # Any other labelled row (except the known ones) extends the name.
        if name and name not in ('Name:', 'Price:', 'Product Code:',
                                 'Availability:', 'Qty:'):
            product_name += ' - ' + ''.join(
                row.select('./td[2]//text()').extract()).strip()
        if name == 'Price:':
            # First text node is the selling price; text after "RRP " in the
            # same cell, if present, is parsed as the RRP.
            price = row.select('./td[2]//text()').extract()[0]
            rrp = str(
                extract_price(''.join(
                    row.select('./td[2]//text()').re(r'RRP (.*)'))))
            product_loader.add_value('price', extract_price(price))
        if name == 'Product Code:':
            # The product code is used as both identifier and SKU.
            sku = row.select('./td[2]//text()').extract()[0]
            product_loader.add_value('identifier', sku)
            product_loader.add_value('sku', sku)
        if name == 'Availability:':
            stock = row.select('./td[2]//text()').extract()[0]
            # Stock is only recorded (as 0) when the text lacks 'In stock'.
            if 'In stock' not in stock:
                product_loader.add_value('stock', 0)
    # Emit the product that was still being assembled when the rows ran out.
    if product_name:
        product_loader.add_value('name', product_name)
        product_loader.add_value('brand', brand)
        product_loader.add_value('url', response.url)
        product_loader.add_value('category', category)
        if image_url:
            product_loader.add_value('image_url', urljoin_rfc(base_url,
                                                              image_url[0]))
        product = product_loader.load_item()
        metadata = CRCMeta()
        metadata['rrp'] = rrp
        product['metadata'] = metadata
        yield product
def parse(self, response):
    """Crawl category links, product links and inline products of a page.

    Three things are yielded, in this order:

    * a ``Request`` for every sidebar category and every sub-category
      heading (default callback);
    * a ``Request`` to ``self.parse_product`` for every product link whose
      id - the text after the last ``-`` in the URL segment just before the
      final ``.`` - has not been seen on this page yet;
    * a ``Product`` for every ``div.col-xs-12`` block that has a ``Q_<id>``
      input with an id not seen yet; the item uses the page URL.

    :param response: listing page response.
    :returns: generator of ``Request`` objects and ``Product`` items.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    # Ids already handled on this page; a set keeps membership tests O(1).
    parsed_ids = set()

    def product_id_from(url):
        # "name-123.html" -> "123"; None when the URL contains no dot.
        parts = url.split('.')
        if len(parts) < 2:
            return None
        return parts[-2].split('-')[-1]

    for cat in hxs.select(
            '//*[@id="sidebar"]//li[@class="sections-list"]/a/@href').extract():
        yield Request(url=urljoin_rfc(base_url, cat))

    for subcat in hxs.select(
            '//*[@id="ContentPage"]//span[@class="boxheading"]/a/@href'
    ).extract():
        yield Request(url=urljoin_rfc(base_url, subcat))

    product_link_xpaths = (
        '//div[@id="content"]//form//a[@class="read-more"]/@href',
        '//div[@id="content"]//form//h1/../../a/@href',
    )
    for link_xpath in product_link_xpaths:
        for url in hxs.select(link_xpath).extract():
            if not url:
                continue
            pid = product_id_from(url)
            if pid is None or pid in parsed_ids:
                continue
            parsed_ids.add(pid)
            # The yield sits outside any try block: a bare except around it
            # would also swallow GeneratorExit when the generator is closed.
            yield Request(urljoin_rfc(base_url, url),
                          callback=self.parse_product)

    category = hxs.select(
        '//*[@id="ContentPage"]/p[@class="text_breadcrumbs"]/a/text()'
    ).extract()[1:]

    for product in hxs.select(
            '//*[@id="ContentPage"]//div[@class="col-xs-12"]'):
        pid = product.select(
            './/input[contains(@name, "Q_")]/@name').re(r'Q_(.+)')
        if not pid:
            continue
        pid = pid[0]
        if pid in parsed_ids:
            continue
        parsed_ids.add(pid)

        price = product.select(
            './/span[@class="catalog-price" or @class="product-price"]/text()'
        ).re(u'\xa3([\d\.,]+)')
        if not price:
            # Skip instead of raising IndexError and losing the rest of the
            # page.
            self.log('No price found for product %s on %s'
                     % (pid, response.url))
            continue

        name = ''.join(product.select('.//h1//text()').extract())
        image_url = product.select(
            './/img[@class="catalog-image"]/@src').extract()
        image_url = urljoin_rfc(base_url, image_url[0]) if image_url else ''

        loader = ProductLoader(item=Product(), selector=product)
        loader.add_value('url', response.url)
        loader.add_value('name', name)
        loader.add_value('sku', pid)
        loader.add_value('identifier', pid)
        loader.add_value('category', category)
        loader.add_value('image_url', image_url)
        loader.add_value('price', price[0])
        yield loader.load_item()
def parse_product(self, response):
    """Yield the page's main product followed by one item per variant.

    The main item is built from the page markup.  Variants are read from the
    first line of the body containing ``variants:``, decoded with ``demjson``;
    each variant item is a copy of the main item with its own name, SKU,
    identifier and price.  Every item carries a ``FragranceDirectMeta`` with
    ``promotion`` and, when the price is non-zero, ``price_exc_vat`` (the
    price divided by 1.2).

    :param response: product page response.
    :returns: generator of ``Product`` items.
    """
    hxs = HtmlXPathSelector(response)

    options = None
    js_line = ''
    for line in response.body.split('\n'):
        if 'variants:' in line:
            js_line = line
            break
    if js_line:
        # The captured group is optional; it is None when the line does not
        # contain the closing "};", in which case there is nothing to decode.
        variants = re.search(r'variants:(.*};)?', js_line).group(1)
        if variants:
            # Drop the trailing two characters before decoding.
            options = demjson.decode(variants[:-2].strip())

    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_identifier = hxs.select(
        '//input[@id="productId" or @name="productId"]/@value').extract()[0]
    product_loader.add_value('identifier', product_identifier)
    product_loader.add_value('url', response.url)
    name = hxs.select('//span[@itemprop="name"]/text()').extract()[0]
    product_loader.add_value('name', name)
    category = hxs.select(
        '//*[@id="breadcrumb"]//a/text()').extract()[1:-1]
    product_loader.add_value('category', category)
    product_loader.add_xpath('sku',
                             '//span[@class="pd_productVariant"]/text()')
    img = hxs.select('//meta[@property="og:image"]/@content').extract()
    if img:
        product_loader.add_value(
            'image_url', urljoin_rfc(get_base_url(response), img.pop()))
    price = hxs.select(
        '//p[@class="productOfferPrice"]/text()').extract()[0]
    product_loader.add_value('price', extract_price(price))
    brand = hxs.select('//*[@id="brandHeader"]/a/@href').extract()
    if brand:
        # The brand is derived from the header link: "/en/" removed and the
        # last character dropped.
        brand = brand[0].replace('/en/', '')[:-1]
        product_loader.add_value('brand', brand)
    stock = ''.join(
        hxs.select(
            '//div[@class="cvos-availbility-panel"]/p/text()').extract())
    if 'Item is currently out of stock online' in stock:
        product_loader.add_value('stock', 0)
    product = product_loader.load_item()

    metadata = FragranceDirectMeta()
    prom = ''.join(
        hxs.select('//div[@class="productSavings"]//text()').extract())
    metadata['promotion'] = prom + ' ' + ''.join(
        hxs.select('//div[@class="primaryItemDeal"]//p/text()').extract())
    if product['price']:
        metadata['price_exc_vat'] = Decimal(
            product['price']) / Decimal('1.2')
    product['metadata'] = metadata
    yield product

    if options:
        for key, val in options.items():
            option_name = key.replace('_', ' ')
            option_product = Product(product)
            # Product(product) copies field references only, so each variant
            # needs its own metadata object; otherwise writing one variant's
            # price_exc_vat would overwrite it on the base item and on every
            # other variant.
            option_product['metadata'] = FragranceDirectMeta(
                product['metadata'])
            option_product['name'] = product['name'] + ' ' + option_name
            option_product['sku'] = val['productCode']
            option_product['identifier'] = val['variantId']
            option_product['price'] = extract_price(val['nowPrice'])
            if option_product.get('price'):
                option_product['metadata']['price_exc_vat'] = Decimal(
                    option_product['price']) / Decimal('1.2')
            yield option_product
def parse_product(self, response):
    """Yield a single item for the product page.

    The identifier is the three first arguments of the add-to-cart
    ``onclick`` handler joined with underscores; when that handler cannot be
    parsed an error is logged and nothing is yielded.  Price, image, name and
    SKU each try a primary location and then a fallback one.  MPN and UPC,
    when found, are attached in an ``EServiceGroupMeta``.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    def last_or_blank(values):
        # Last extracted text node, or '' when nothing was extracted.
        if values:
            return values[-1]
        return ''

    # Price: tracking script first, then the visible price, then microdata.
    price = (response.xpath('//script/text()').re('ecomm_totalvalue:(.*),')
             or response.css('.itemPrice b::text').extract()
             or response.xpath('//span[@itemprop="price"]/text()').extract())
    brand = response.css('.brandImage ::attr(alt)').extract()

    onclick = response.xpath(
        '//td[@class="itemActions"]/a/@onclick').extract()[0]
    cart_call = re.search(
        r"slrhutAdd.*?ToCart\('(.*?)','(.*?)','(.*?)','.*?'\);", onclick,
        re.DOTALL | re.IGNORECASE | re.MULTILINE)
    if not cart_call:
        self.log(
            'ERROR: Could not parse product identifier, URL: {}'.format(
                response.url))
        return
    product_identifier = "{}_{}_{}".format(*cart_call.groups())

    image_url = response.xpath('//img[@itemprop="image"]/@src').extract()
    if not image_url:
        image_url = hxs.select(
            '//td[@class="itemImage"]/img/@src').extract()

    name_nodes = response.xpath('//p[@itemprop="name"]/text()').extract()
    if not name_nodes:
        name_nodes = hxs.select(
            '//p[@class="itemTitleHeader"]/a/text()').extract()
    product_name = name_nodes[0].strip()

    category = response.xpath(
        '//p[@class="mainPageHeader"]//a/text()').extract()

    availability = response.xpath(
        '//div[@class="itemContentAvailable"]//span/text()').extract()
    out_of_stock = bool(availability) and (
        'out of stock' in availability[0].lower())

    sku_nodes = response.xpath('//span[@itemprop="sku"]//text()').extract()
    if not sku_nodes:
        sku_nodes = hxs.select(
            '//div[@class="itemContentMPN" and contains(., "SKU")]//text()'
        ).extract()
    sku = last_or_blank(sku_nodes)

    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('identifier', product_identifier)
    product_loader.add_value('name', product_name)
    if image_url:
        product_loader.add_value('image_url',
                                 urljoin_rfc(base_url, image_url[0]))
    product_loader.add_value('sku', sku)
    product_loader.add_value('price', price)
    product_loader.add_value('url', response.url)
    product_loader.add_value('category', category)
    product_loader.add_value('brand', brand)
    if out_of_stock:
        product_loader.add_value('stock', 0)
    product = product_loader.load_item()

    mpn = last_or_blank(response.xpath(
        '//div[@class="itemContentMPN" and contains(., "MPN")]//text()'
    ).extract())
    upc = last_or_blank(response.xpath(
        '//div[@class="itemContentMPN" and contains(., "UPC")]//text()'
    ).extract())
    if mpn or upc:
        metadata = EServiceGroupMeta()
        if mpn:
            metadata['mpn'] = mpn
        if upc:
            metadata['upc'] = upc
        product['metadata'] = metadata

    yield product
def parse_categories(self, response):
    """Follow category links and yield the products listed on the page.

    Every category-title and sidebar link is requested again through this
    same callback with ``/show/all`` appended and ``response.meta`` passed
    on.  Products are read from the ``products-grid`` list; tiles that are
    "Full Range" links, or that lack a name, a SKU line or a price, are
    skipped.

    :param response: category page response; ``meta['brand']`` is read.
    :returns: generator of ``Request`` objects and ``Product`` items.
    """
    base_url = get_base_url(response)

    urls = response.xpath(
        '//td[@class="cat_list_title"]/a/@href').extract()
    urls += response.xpath('//div[@id="sidebar"]//a/@href').extract()
    for url in urls:
        yield Request(urljoin_rfc(base_url, url + '/show/all'),
                      callback=self.parse_categories,
                      meta=response.meta)

    brand = response.meta.get('brand', '')
    products = response.xpath(
        '//ul[contains(@class, "products-grid")]/li[contains(@class, "item")]'
    )
    for product in products:
        if product.xpath('.//a[contains(text(), "Full Range")]'):
            continue

        name = product.xpath(
            './/h2[@class="product-name"]/text()').extract()
        if not name:
            continue

        # The description paragraph that does not mention "Stock" is used as
        # both SKU and identifier.
        sku = product.xpath(
            './/div[@class="product-description"]/p[not(contains(text(), "Stock"))]/text()'
        ).extract()
        if not sku:
            continue

        loader = ProductLoader(item=Product(), selector=product)
        loader.add_value('name', name[0])
        loader.add_value('sku', sku)
        loader.add_value('identifier', sku)
        loader.add_value(
            'url',
            product.xpath('.//a[@class="product-line"]/@href').extract())

        image_url = product.xpath(
            './/div[@class="product-image"]/img/@src').extract()
        if image_url:
            loader.add_value('image_url',
                             urljoin_rfc(base_url, image_url[0]))
        loader.add_value('brand', brand)

        out_of_stock = product.xpath(
            './/p[@class="availability out-of-stock"]').extract()
        if out_of_stock:
            loader.add_value('stock', 0)

        price = product.xpath(
            './/span[@class="regular-price"]/span[@class="price"]/text()'
        ).extract()
        if not price:
            price = product.xpath(
                './/p[@class="special-price"]/span[@class="price"]/text()'
            ).extract()
        if not price:
            # Out-of-stock tiles without a price were already skipped; an
            # in-stock tile without one used to raise IndexError and abort
            # the rest of the page.  Log it and move on instead.
            if not out_of_stock:
                self.log('No price found for %s on %s'
                         % (name[0], response.url))
            continue

        loader.add_value('price', extract_price(price[0]))
        yield loader.load_item()
def parse_products_list(self, response):
    """Walk a listing page: sub-categories, products, then further pages.

    Sub-category links are followed with this same callback.  When the page
    shows no product links it is requested again, up to a retry counter of
    10 kept in ``meta['retry']``, and processing stops there.  Otherwise each
    product link goes to ``self.parse_product``.  For URLs without
    ``pageSize=`` and a product count above 12, one AJAX ``FormRequest`` per
    further block of 12 products is issued against the facet data URL; when
    that URL is missing the page is retried under the same retry limit.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    def next_attempt():
        # Request for one more try at this URL, or None once the counter
        # would reach 10.
        attempt = int(response.meta.get('retry', 0))
        attempt += 1
        if attempt < 10:
            return Request(response.url,
                           meta={'retry': attempt},
                           dont_filter=True,
                           callback=self.parse_products_list)
        return None

    for link in hxs.select(
            '//div[@class="category filterCategory"]//li/a/@href').extract():
        yield Request(urljoin_rfc(base_url, link),
                      callback=self.parse_products_list)

    product_links = hxs.select(
        '//div[@class="row productList"]//div[@class="product"]/div[@class="productContent"]/a/@href'
    ).extract()
    self.log('Product # found: {}'.format(len(product_links)))

    if not product_links:
        self.log('Retry: {}'.format(response.url))
        again = next_attempt()
        if again is not None:
            yield again
        else:
            self.log("No products found on the page on %s" % response.url)
        return

    for link in product_links:
        yield Request(urljoin_rfc(base_url, link),
                      dont_filter=True,
                      callback=self.parse_product,
                      cookies={},
                      meta={'dont_merge_cookies': True})

    # Further pages are only expanded from URLs without "pageSize=".
    if 'pageSize=' in response.url:
        return

    counter_text = hxs.select(
        '//div[@id="plpContent"]/div[@id="searchCounter"]/text()')
    products_count = int(counter_text.re('"productCount".*?(\d+)')[0])
    self.log('products_count: {}'.format(products_count))
    if products_count <= 12:
        return

    facet_url = hxs.select(
        '//div[@class="facetJSON hide"]/@data-url').extract()
    if facet_url:
        facet_url = facet_url[0]
        self.log('Next P: {}'.format(facet_url))
        for page in xrange((products_count - 1) / 12):
            begin_index = (page + 1) * 12
            page_url = add_or_replace_parameter(facet_url, 'beginIndex',
                                                str(begin_index))
            yield FormRequest(page_url,
                              formdata={
                                  'beginIndex': str(begin_index),
                                  'scrollTo': 'false',
                                  'requesttype': 'ajax'
                              },
                              callback=self.parse_products_list,
                              cookies={},
                              meta={'dont_merge_cookies': True})
    else:
        self.log('Retry next page: {}'.format(response.url))
        again = next_attempt()
        if again is not None:
            yield again
        else:
            self.log("No next page found on the page on %s" % response.url)
def parse(self, response):
    """Request the product page behind every list-item heading link."""
    page_base = get_base_url(response)
    selector = HtmlXPathSelector(response)
    heading_links = selector.select(
        '//div[@class="list-item "]//h3/a/@href').extract()
    for href in heading_links:
        target = urljoin_rfc(page_base, href)
        yield Request(target, callback=self.parse_product)
def parse_product(self, response):
    """Yield one item per entry of the page's ``productItems`` JS array.

    Pages showing the "OOPS! Page not found." paragraph yield nothing.  Name,
    category, SKU, price and RRP are read once from the markup and shared by
    all entries; identifier, stock, image, colour and size come from each
    array entry.  When the array cannot be found a warning is logged.
    """
    hxs = HtmlXPathSelector(response)
    if hxs.select('//p[contains(text(), "OOPS! Page not found.")]'):
        return

    product_name = hxs.select(
        '//h1[@class="product_title"]/text()').extract()[0].strip()

    # Second breadcrumb entry, or the third when the second is the outlet.
    crumb = hxs.select(
        '//ul[@class="pb-breadcrumb"]/li[2]/a/text()').extract()
    category = ''
    if crumb:
        category = crumb[0].strip()
        if category == 'Outlet Store':
            category = hxs.select(
                '//ul[@class="pb-breadcrumb"]/li[3]/a/text()').extract(
                )[0].strip()

    brand = response.meta.get('brand', '')

    # Keep only the directory of the main image; per-item image names are
    # joined onto it further down.
    main_image = hxs.select('//img[@id="image1"]/@src').extract()[0]
    image_dir = main_image.rpartition('/')[0] + '/'

    sku = hxs.select('//span[@class="product_number"]/text()').extract(
    )[0].strip().replace('#', '')

    price_nodes = (
        hxs.select('//span[@class="sale_price_val"]/text()').extract()
        or hxs.select('//span[@class="list_price_val"]/text()').extract())
    price = extract_price(price_nodes[0])

    rrp_text = (
        ''.join(
            hxs.select(
                '//span[contains(@class, "msrp_price_va")]/text()').extract())
        or ''.join(
            hxs.select('//span[contains(@class, "list_price_val")]/text()'
                       ).extract()))
    rrp_value = extract_price(rrp_text)
    rrp = str(rrp_value) if rrp_value > price else ''

    config = re.search(
        r'var productItems = (\[\s+\{.*\}\s+\])', response.body, re.DOTALL)
    if not config:
        self.log('WARNING!!! url: {}'.format(response.url))
        return

    # Strip all whitespace characters, repair the trailing comma before "]"
    # and double the backslashes so the literal parses as JSON.
    json_string = config.group(1)
    for blank in ('\r', '\n', '\t', ' '):
        json_string = json_string.replace(blank, '')
    json_string = json_string.replace('},]', '}]').replace('\\', '\\\\')

    for entry in json.loads(json_string):
        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('identifier', entry['itemId'])

        stock = entry['inventoryNumber']
        message = entry['inVentoryMessage']
        if 'Avail.' in message or 'OutofStock' in message:
            stock = 0
        product_loader.add_value('stock', stock)

        product_loader.add_value('image_url',
                                 urljoin_rfc(image_dir, entry['mainImage']))
        product_loader.add_value('url', response.url)
        product_loader.add_value('brand', brand)
        product_loader.add_value('sku', sku)
        product_loader.add_value('price', price)
        product_loader.add_value('category', category)

        name = product_name
        color = entry['color']
        if color:
            name += ', ' + color
        size = entry['size']
        if len(size) > 0:
            name += ', {}'.format(size[0]['longform'])
        product_loader.add_value('name', name)

        item = product_loader.load_item()
        metadata = CRCMeta()
        metadata['rrp'] = rrp
        item['metadata'] = metadata
        yield item
def parse_product(self, response):
    """Yield the product, or one item per configurable option when present.

    A base item is always built from the page (special price preferred over
    regular price, 0 when neither is found).  When the page embeds a
    ``spConfig`` JSON blob, one item per simple product referenced by its
    options is yielded instead of the base item: the identifier and SKU are
    the base identifier plus ``_<simple id>``, the name has the option labels
    appended, and the price is the base price plus the summed option prices.

    :param response: product page response.
    :returns: generator of ``Product`` items.
    """
    hxs = HtmlXPathSelector(response)

    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_identifier = hxs.select(
        './/input[@name="product"]/@value').extract()[0]
    product_loader.add_value('identifier', product_identifier)
    product_loader.add_value('url', response.url)
    name = hxs.select(
        '//div[@class="product-name"]/h1/text()').extract()[0]
    product_loader.add_value('name', name)
    category = hxs.select(
        '//ul[@class="breadcrumbs"]/li/a/text()').extract()[1:-1]
    product_loader.add_value('category', category)
    product_loader.add_value('sku', product_identifier)
    img = hxs.select('//img[@id="main-img"]/@src').extract()
    if img:
        product_loader.add_value(
            'image_url', urljoin_rfc(get_base_url(response), img.pop()))
    price = hxs.select(
        '//form//p[@class="special-price"]/span[@class="price"]/text()'
    ).extract()
    if not price:
        price = hxs.select(
            '//form//span[@class="regular-price"]/span[@class="price"]/text()'
        ).extract()
    price = extract_price(price[0]) if price else 0
    product_loader.add_value('price', price)
    item = product_loader.load_item()

    options_config = re.search(
        r'var spConfig = new Product.Config\((.*)\)', response.body)
    if not options_config:
        yield item
        return

    product_data = json.loads(options_config.group(1))
    # simple-product id -> option labels, each prefixed with ' - '
    option_names = {}
    # simple-product id -> sum of the option price adjustments
    option_prices = {}
    for attr in product_data['attributes'].itervalues():
        for option in attr['options']:
            for simple_id in option['products']:
                option_names[simple_id] = ' - '.join(
                    (option_names.get(simple_id, ''), option['label']))
                option_prices[simple_id] = option_prices.get(
                    simple_id, 0) + extract_price(option['price'])

    # The base item has no 'image_url' key when the page shows no main
    # image; indexing it directly would raise KeyError.
    base_image = item.get('image_url')

    for identifier, option_name in option_names.iteritems():
        option_loader = ProductLoader(item=Product(), selector=hxs)
        option_loader.add_value('identifier',
                                item['identifier'] + '_' + identifier)
        option_loader.add_value('sku',
                                item['identifier'] + '_' + identifier)
        option_loader.add_value('name', item['name'] + ' ' + option_name)
        if base_image:
            option_loader.add_value('image_url', base_image)
        option_loader.add_value('price',
                                item['price'] + option_prices[identifier])
        option_loader.add_value('url', response.url)
        option_loader.add_value('brand', '')
        option_loader.add_value('category', category)
        yield option_loader.load_item()
def parse(self, response):
    """Crawl category links, result rows and pagination of a listing page.

    * Each category link is followed with this callback; ``/prl/results`` is
      appended unless the URL already contains it or contains ``webapp``.
    * Each result row with an identifier, stock and price is either turned
      into an item straight from ``self.cache_data`` (only price and stock
      are taken from the page) or, when not cached, requested through
      ``self.parse_product``.  Handled URLs are removed from
      ``self.missing_urls``.
    * Pagination links are followed with this callback.
    * A page with neither result rows nor category links is re-requested as
      a product page.

    :param response: listing page response.
    :returns: generator of ``Request`` objects and ``Product`` items.
    """
    base_url = get_base_url(response)
    meta = response.meta.copy()

    categories_urls = response.xpath('//ul[@class="categoryList"]/li//a')
    for category in categories_urls:
        url = category.select('@href').extract()[0]
        name = category.select('text()').extract()[0].strip()
        if "/prl/results" not in url and 'webapp' not in url:
            url += "/prl/results"
        yield Request(urljoin_rfc(base_url, url),
                      callback=self.parse,
                      meta={'category': name})

    # Fields copied as-is (UTF-8 decoded) from the cached record.
    cached_fields = ('name', 'url', 'sku', 'category', 'image_url', 'brand')

    products = response.xpath(
        '//table[@id="sProdList"]/tbody/tr[td[@class="productImage"]]')
    for product in products:
        try:
            identifier = product.select(
                './/a[@class="sku"]/text()').extract()[0].strip()
            stock = int(
                product.select(
                    './/td[@class="availability"]/input[@class="hVal"]/@value'
                ).extract()[0])
            price = round(
                Decimal(
                    product.css(
                        '.price input.hVal::attr(value)').extract()[0]), 2)
        except IndexError:
            # Rows missing any of the three values are skipped.
            continue

        if identifier in self.cache_data:
            product_cached = self.cache_data[identifier]
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('identifier', identifier)
            for field in cached_fields:
                loader.add_value(field,
                                 product_cached[field].decode('utf-8'))
            loader.add_value('price', price)
            loader.add_value('stock', stock)
            item = loader.load_item()
            try:
                self.missing_urls.remove(item['url'])
            except ValueError:
                pass
            yield item
        else:
            url = product.select(
                './/a[@class="sku"]/@href').extract()[0].strip()
            url = url_query_cleaner(url)
            if url in self.missing_urls:
                self.missing_urls.remove(url)
            yield Request(url, callback=self.parse_product, meta=meta)

    for url in response.css('.pages .pageIt a::attr(href)').extract():
        yield Request(urljoin_rfc(base_url, url),
                      callback=self.parse,
                      meta=meta)

    if not products and not categories_urls:
        yield Request(url_query_cleaner(response.url),
                      dont_filter=True,
                      callback=self.parse_product,
                      meta=meta)
def parse(self, response):
    """Request every navigation link except the first, 100 articles a page.

    ``?_artperpage=100`` is appended to each link before it is resolved
    against the page base URL; results go to ``self.parse_products_list``.
    """
    selector = HtmlXPathSelector(response)
    page_base = get_base_url(response)
    navigation_links = selector.select(
        '//*[@id="navigation"]//a/@href').extract()
    for href in navigation_links[1:]:
        listing_url = urljoin_rfc(page_base, href + '?_artperpage=100')
        yield Request(listing_url, callback=self.parse_products_list)
def parse(self, response):
    """Request the product page behind every product-card title link.

    Links are taken from anchors inside ``section#results`` whose class
    contains ``productCard__title`` and resolved against the page base URL.

    :param response: results page response.
    :returns: generator of ``Request`` objects for ``self.parse_product``.
    """
    base_url = get_base_url(response)
    product_links = response.xpath(
        '//section[@id="results"]'
        '//a[contains(@class, "productCard__title")]/@href').extract()
    for url in product_links:
        yield Request(urljoin_rfc(base_url, url),
                      callback=self.parse_product)
def parse_product(self, response):
    """Yield the used-equipment product, or one item per configurable option.

    The product id comes from the hidden ``product`` input or, failing that,
    from the add-to-cart form action.  When neither is present the page is
    requested again, at most 10 times, and then given up with a log message.
    The price is the displayed price (plus its separate pennies part, if
    any) with any cashback amount added.  With a ``spConfig`` JSON blob each
    simple product gets its own item priced at the base price plus the price
    of the last option that references it.

    :param response: product page response; ``meta['brand']`` is read.
    :returns: generator of ``Product`` items, or of one retry ``Request``.
    """
    max_identifier_retries = 10

    base_url = get_base_url(response)
    image_url = response.xpath('//img[@itemprop="image"]/@src').extract()
    brand = response.meta.get('brand', '')

    product_identifier = response.xpath(
        '//input[@name="product"]/@value').extract()
    if product_identifier:
        product_identifier = product_identifier[0].strip()
    else:
        product_identifier = response.xpath(
            '//form[@id="product_addtocart_form"]/@action').re(
                r'/product/(\d+)')
        if not product_identifier:
            # Re-request the page, but with a bounded number of attempts so
            # a permanently broken page cannot loop forever, and carry the
            # brand along so it is not lost on the retry.
            attempt = int(response.meta.get('identifier_retries', 0)) + 1
            if attempt <= max_identifier_retries:
                yield Request(response.url,
                              callback=self.parse_product,
                              dont_filter=True,
                              meta={
                                  'brand': brand,
                                  'identifier_retries': attempt
                              })
            else:
                self.log('Giving up, no product identifier on %s'
                         % response.url)
            return
        product_identifier = product_identifier[0]

    product_name = response.xpath(
        '//h2[@itemprop="name"]/text()').extract()[0]
    category = 'Used Equipment'
    sku = response.xpath('//div[@class="quickfind"]/text()').extract()
    sku = sku[0].replace('Quick find', '').strip() if sku else ''

    price = response.xpath(
        '//*[@id="product-price-{}"]/div/span[@class="price"]/text()'.
        format(product_identifier)).extract()[0]
    price_pennies = response.xpath(
        '//*[@id="product-price-{}"]/div/span[@class="price"]/span[@class="price-pennies"]/text()'
        .format(product_identifier)).extract()
    if price_pennies:
        # The pennies are rendered in their own span; append them to the
        # price text before parsing.
        price += price_pennies[0]
    price = extract_price(price)
    cashback = response.xpath('//div[@class="cashback"]/text()').extract()
    if cashback:
        price += extract_price(cashback[0])

    options_config = re.search(
        r'var spConfig = new Product.Config\((.*)\)', response.body)
    if options_config:
        product_data = json.loads(options_config.group(1))
        # simple-product id -> option labels, each prefixed with a space
        option_names = {}
        # simple-product id -> price of the last option referencing it
        option_prices = {}
        for attr in product_data['attributes'].itervalues():
            for option in attr['options']:
                option_price = extract_price(option['price'])
                for simple_id in option['products']:
                    option_names[simple_id] = ' '.join(
                        (option_names.get(simple_id, ''), option['label']))
                    option_prices[simple_id] = option_price

        for identifier, option_name in option_names.iteritems():
            product_loader = ProductLoader(item=Product(),
                                           response=response)
            product_loader.add_value('identifier',
                                     product_identifier + '_' + identifier)
            product_loader.add_value('name',
                                     product_name + ' ' + option_name)
            if image_url:
                product_loader.add_value(
                    'image_url', urljoin_rfc(base_url, image_url[0]))
            product_loader.add_value('url', response.url)
            product_loader.add_value('category', category)
            product_loader.add_value('brand', brand)
            product_loader.add_value('sku', sku)
            product_loader.add_value('price',
                                     price + option_prices[identifier])
            yield product_loader.load_item()
    else:
        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('identifier', product_identifier)
        product_loader.add_value('name', product_name)
        if image_url:
            product_loader.add_value('image_url',
                                     urljoin_rfc(base_url, image_url[0]))
        product_loader.add_value('url', response.url)
        product_loader.add_value('category', category)
        product_loader.add_value('brand', brand)
        product_loader.add_value('sku', sku)
        product_loader.add_value('price', price)
        yield product_loader.load_item()