def parse_item(self, response):
    """Parse a product page and yield a single ``Product`` item.

    The product name and displayed price are read from the page.  When the
    page shows an eco-tax amount, that amount is subtracted from the
    displayed price.  Pages without a name or a price are logged and
    skipped (nothing is yielded).

    :param response: the downloaded product page.
    """
    url = response.url
    hxs = HtmlXPathSelector(response)

    name = hxs.select(
        "//div[@id='primary_block']/div[@id='pb-left-column']/h2/text()"
    ).extract()
    if not name:
        logging.error("NO NAME! %s" % url)
        return
    name = name[0]

    price = hxs.select(
        "//p[@class='price']/span[@class='our_price_display']/span/text()"
    ).extract()
    if not price:
        logging.error("NO PRICE! %s" % url)
        return
    price = Decimal(extract_price2uk(price[0]))

    eco_tax = hxs.select("//p[@class='price-ecotax']/span/text()").extract()
    if eco_tax:
        # Strip non-ASCII characters before the amount is parsed.
        eco_tax_text = eco_tax[0].encode('ascii', 'ignore')
        logging.info("Found eco tax %s", eco_tax_text)
        price -= Decimal(extract_price2uk(eco_tax_text))

    loader = ProductLoader(item=Product(), response=response)
    # The name doubles as identifier.  It is passed through unchanged:
    # str(name) raises UnicodeEncodeError on Python 2 for non-ASCII names.
    loader.add_value('identifier', name)
    loader.add_value('name', name)
    loader.add_value('url', url)
    loader.add_value('price', unicode(price))
    yield loader.load_item()
def parse_item(self, response):
    """Parse a product page and yield a single ``Product`` item.

    The product name and displayed price are read from the page.  When the
    page shows an eco-tax amount, that amount is subtracted from the
    displayed price.  Pages without a name or a price are logged and
    skipped (nothing is yielded).

    :param response: the downloaded product page.
    """
    url = response.url
    hxs = HtmlXPathSelector(response)

    name = hxs.select(
        "//div[@id='primary_block']/div[@id='pb-left-column']/h2/text()"
    ).extract()
    if not name:
        logging.error("NO NAME! %s" % url)
        return
    name = name[0]

    price = hxs.select(
        "//p[@class='price']/span[@class='our_price_display']/span/text()"
    ).extract()
    if not price:
        logging.error("NO PRICE! %s" % url)
        return
    price = Decimal(extract_price2uk(price[0]))

    eco_tax = hxs.select("//p[@class='price-ecotax']/span/text()").extract()
    if eco_tax:
        # Strip non-ASCII characters before the amount is parsed.
        eco_tax_text = eco_tax[0].encode("ascii", "ignore")
        logging.info("Found eco tax %s", eco_tax_text)
        price -= Decimal(extract_price2uk(eco_tax_text))

    loader = ProductLoader(item=Product(), response=response)
    # The name doubles as identifier.  It is passed through unchanged:
    # str(name) raises UnicodeEncodeError on Python 2 for non-ASCII names.
    loader.add_value("identifier", name)
    loader.add_value("name", name)
    loader.add_value("url", url)
    loader.add_value("price", unicode(price))
    yield loader.load_item()
def _get_item_price(self, item):
    """Return the price of a listing page, or ``None`` when unavailable.

    The price text is looked up in ``#prcIsum``, then ``#mm-saleDscPrc``,
    and finally in the ``"binPrice"`` entry of the raw page body.  When
    ``self._converted_price`` is set and the page shows a converted price,
    that text replaces the one found above.  If ``self._check_valid_currency``
    is callable, it receives the characters preceding the first digit of the
    price text and may veto the price.

    :param item: selector for the listing page (must expose ``response``).
    :returns: ``extract_price2uk`` applied to the price text, or ``None``.
    """
    price_xpaths = (
        '//*[@id="prcIsum"]/text()',
        '//*[@id="mm-saleDscPrc"]/text()',
    )
    for xpath in price_xpaths:
        found = item.select(xpath).extract()
        if found:
            price = found[0].strip()
            break
    else:
        # Neither element exists: fall back to the embedded page data.
        match = re.search(r'"binPrice":".*[\$\xA3]([\d\.,]+)",',
                          item.response.body)
        if match is None:
            self.errors.append("Price not found for " + item.response.url)
            return None
        price = match.groups()[0]

    # Converted price
    if self._converted_price:
        converted = item.select(
            u'//div[@id="prcIsumConv"]/span/text()').extract()
        if converted:
            price = converted[0]

    if not price:
        return None

    if callable(self._check_valid_currency):
        # The currency marker is everything before the first digit.
        prefix_len = len(price)
        for pos, char in enumerate(price):
            if char.isdigit():
                prefix_len = pos
                break
        if not self._check_valid_currency(price[:prefix_len]):
            return None

    return extract_price2uk(price)
def parse_product(self, response):
    """Yield a copy of the product from ``response.meta`` with its VAT price.

    The price is only set when the ``.pproductpriceVAT`` element has text;
    otherwise the copied product is yielded as it is.
    """
    item = response.meta['product'].copy()
    vat_price = response.css(
        '.pdetails .pproductpriceVAT::text').extract_first()
    if not vat_price:
        yield item
        return
    item['price'] = extract_price2uk(vat_price)
    yield item
def parse_product(self, response):
    """Collect product details and request the stock information.

    Fills a ``ProductLoader`` from the page markup and from the
    ``var context = {...}`` JSON embedded in a script tag, then yields a
    request to ``STOCK_URL + identifier`` that carries the loader in its
    meta, to be handled by ``parse_availability``.
    """
    context_json = response.xpath('//script/text()').re(
        'var context = ({.+?});')
    context = json.loads(context_json[0])

    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(selector=hxs, item=Product())

    loader.add_xpath('name', '//span[contains(@id, "ProductTitle")]/text()')

    # Keep scheme, host and path only; query and fragment are dropped.
    scheme, netloc, path = urlsplit(response.url)[:3]
    loader.add_value('url', urlunsplit([scheme, netloc, path, '', '']))

    loader.add_value('brand', 'Lego')
    loader.add_value('category', 'Lego')

    price_text = hxs.select(
        '//div[contains(@class, "prodprice")]/span/text()').extract()[0]
    loader.add_value('price', extract_price2uk(price_text))

    first_media = context['pdpProduct']['additionalMedia'][0]
    loader.add_value('image_url', urljoin(base_url, first_media['url']))

    loader.add_xpath('sku', '//span[@itemprop="model"]/text()')

    identifier = hxs.select(
        '//span[@itemprop="productid"]/text()').extract()[0]
    loader.add_value('identifier', identifier)

    yield Request(STOCK_URL + identifier,
                  callback=self.parse_availability,
                  meta={'loader': loader})
def __construct_product(self, item, meta=None, use_seller_id_in_identifier=None):
    """
    Constructs `Product` instance from dict

    :param item: dict of scraped values; the visible code reads the keys
        'identifier', 'seller_identifier', 'price' and every entry of
        ``necessary_fields``.
    :param meta: optional dict; its 'item' (preferred) or 'search_item'
        entry is used as the search item, falling back to
        ``self.current_search_item``.
    :param use_seller_id_in_identifier: when None, defaults to the
        truthiness of ``self.all_sellers``.

    NOTE(review): only the first part of this method is visible here (it
    ends after the price normalisation), so the return value is not
    documented.
    """
    # Default the flag from the spider configuration.
    if use_seller_id_in_identifier is None:
        if self.all_sellers:
            use_seller_id_in_identifier = True
        else:
            use_seller_id_in_identifier = False

    # Pick the search item: meta['item'] wins over meta['search_item'].
    if meta and 'item' in meta:
        search_item = meta['item']
    elif meta and 'search_item' in meta:
        search_item = meta['search_item']
    else:
        search_item = self.current_search_item

    loader = AmazonProductLoader(item=AmazonProduct(), response=HtmlResponse(''))

    necessary_fields = ['name']
    # The next three names are not used in the visible part of the method.
    optional_fields = ['sku', 'image_url', 'brand', 'stock']
    fields_from_search_item = ['sku', 'category', 'brand', 'identifier']
    synonym_fields = {
        'vendor': 'dealer',
    }

    # Identifier comes either from the scraped item or from the search item.
    identifier = item[
        'identifier'] if self.use_amazon_identifier else search_item.get(
            'identifier')
    # Despite the attribute name, the prefix added here is a colon (':').
    if self.semicolon_in_identifier and \
            identifier and \
            self.use_amazon_identifier and \
            not identifier.startswith(':'):
        identifier = ':' + identifier
    # Optionally make the identifier seller-specific.
    if identifier and use_seller_id_in_identifier and item.get(
            'seller_identifier'):
        identifier += ':' + item['seller_identifier']
    loader.add_value('identifier', identifier)

    # Missing necessary fields raise KeyError here.
    for field in necessary_fields:
        loader.add_value(field, item[field])

    if item['price'] is not None:
        try:
            # A tuple/list price is reduced to its first element (in place).
            if type(item['price']) == tuple or type(item['price']) == list:
                item['price'] = item['price'][0]
            # Decimal prices are used as they are; anything else is parsed.
            price = extract_price2uk(item['price']) if not isinstance(
                item['price'], Decimal) else item['price']
        except Exception, e:
            # Log the offending value, then propagate the error.
            self.log('ERROR: extracting price => PRICE: %s' % repr(item['price']))
            raise e
def get_option(self, response):
    """Complete the product stored in ``response.meta`` from a JSON reply.

    The JSON body supplies the price (``unformattedPrice``) and, when
    non-empty, a ``combinationid`` that is appended to the base id from
    the meta to form the identifier.  The name is taken from the meta.
    """
    meta = response.meta
    option = meta['product']
    payload = json.loads(response.body)

    option['price'] = extract_price2uk(payload['unformattedPrice'])

    combination_id = payload['combinationid']
    identifier = meta['id']
    if combination_id:
        identifier = identifier + '-' + combination_id
    option['identifier'] = identifier

    option['name'] = meta['name']
    yield option
def parse_product(self, response):
    """Parse a product page and yield one ``Product`` item.

    When the product title cannot be read, the same URL is requested again
    (at most 10 times, counted in ``meta['retry']``) and nothing else is
    yielded.  Products without a price are not yielded; a message is
    appended to ``self.errors`` instead.

    :param response: the downloaded product page.
    """
    base_url = get_base_url(response)

    try:
        name = response.css(
            '.content-fiche-produit h1::text').extract_first().strip()
    except Exception:
        # Typically AttributeError: extract_first() returned None because
        # the title is missing from the page.  Ask for the page again.
        retry = int(response.meta.get('retry', 0))
        if retry < 10:
            new_meta = response.meta.copy()
            new_meta['retry'] = retry + 1
            yield Request(response.url,
                          meta=new_meta,
                          callback=self.parse_product,
                          dont_filter=True)
        return

    # The category is the second-to-last breadcrumb link.  A breadcrumb
    # with a single link used to raise IndexError here.
    breadcrumbs = response.css('#breadcrumb a::text').extract()
    category = breadcrumbs[-2] if len(breadcrumbs) > 1 else ""

    sku = response.css('.content-fiche-produit p::text').re(
        u'Référence (\d+)')
    pid = response.css('.content-fiche-produit p::text').re(u'Ref (\d+)')
    price = response.css('.new-price ::text').extract_first()

    # Stock is computed but currently not loaded into the item: the
    # corresponding add_value call below is commented out.
    stock = bool(
        response.xpath('//p[contains(@class, "in-stock")]/text()').extract())
    if not stock:
        stock = 'DISPONIBLE' in ''.join(
            response.xpath(
                '//p[contains(@class, "availability")]//text()'
            ).extract()).upper()

    if not price:
        self.errors.append("No price set for url: '%s'" %
                           urljoin(base_url, response.url))
        return

    loader = ProductLoader(response=response, item=Product())
    loader.add_value('url', urljoin(base_url, response.url))
    loader.add_value('name', name)
    loader.add_css('image_url', '#image ::attr(src)')
    loader.add_value('price', extract_price2uk(price))
    loader.add_value('category', category)
    loader.add_value('sku', sku)
    loader.add_value('identifier', pid)
    loader.add_value('brand', response.meta.get("brand", ""))
    #loader.add_value('stock', int(stock))
    yield loader.load_item()
def parse_search(self, response):
    """Parse a hotel search result page.

    Yields one ``Product`` per result row that has a name, a URL and a
    parseable price, followed by the request returned by
    ``self.get_city_request()``.  Rows with a missing field are logged and
    skipped.

    :param response: the downloaded search result page.
    """
    hxs = HtmlXPathSelector(response)

    count_el = hxs.select(
        "//table[@id='ctl05_myContainer']/tr[3]/td[2]"
        "/div[@id='ctl05_ctl12']/h1/text()"
    ).extract()
    count = '0'
    for el in count_el:
        m = re.search(r"[\d]+", el)
        count = m.group(0) if m else '0'
    logging.error("Found %s hotels" % count)

    hotels = hxs.select(
        "//div[@id='divResults']/div[@class='accomodation grey'] | "
        "//div[@id='divResults']/div[@class='accomodation']"
    )
    for hotel in hotels:
        # Each missing field used to be logged and then indexed anyway,
        # raising IndexError; such rows are now skipped.
        name = hotel.select("div[1]/h4/a/text()").extract()
        if not name:
            logging.error("No name")
            continue
        name = name[0]

        url = hotel.select("a[1]/@href").extract()
        if not url:
            logging.error("No url %s" % name)
            continue
        url = url[0]

        price = hotel.select(
            "div[@class='price']/span[@class='sum2']/text()").extract()
        if not price:
            logging.error("No price")
            continue
        price = extract_price2uk(price[0])
        if price is None:
            logging.error("No price %s" % name)
            continue
        # NOTE(review): `nights` is not defined in this method; it is
        # presumably a module-level value - confirm.
        price = int(price) * nights

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', name.encode('ascii', 'replace'))
        loader.add_value('identifier', name.encode('ascii', 'replace'))
        loader.add_value('url', url)
        loader.add_value('price', price)
        yield loader.load_item()

    yield self.get_city_request()
def parse_product(self, response):
    """Parse a product page and yield the product or its variants.

    When the page has option ``<select>`` boxes and an embedded
    ``spConfig`` JSON, one item per simple-product id found in that JSON is
    yielded; its identifier is suffixed with ``_<id>`` and its name with
    the option labels.  In every other case the main product is yielded.

    :param response: the downloaded product page.
    """
    price = response.css('div.pprice .price ::text').extract()
    if price:
        price = extract_price2uk(price[0])
        stock = 1
    else:
        price = 0
        stock = 0

    in_stock = bool(response.xpath(
        '//*[contains(@class, "availability") and '
        'contains(@class, "in-stock")]'))
    if not in_stock:
        stock = 0

    identifier = response.xpath('//input[@name="product"]/@value').extract()
    sku = [
        code.strip() for code in response.xpath(
            '//div[@class="product-name"]/*[@class="sku_prd"]/text()'
        ).re(r'Product Code:(.*)')
    ]
    breadcrumbs = [
        crumb.strip() for crumb in response.xpath(
            '//*[@itemtype="http://schema.org/BreadcrumbList"]'
            '//*[contains(@itemprop, "name")]/text()').extract()
    ]
    # Drop empty entries, then the first and last breadcrumb.
    category = [crumb for crumb in breadcrumbs if crumb][1:-1]
    name = response.xpath('//div[@class="product-name"]/h1/text()').extract()

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', name)
    loader.add_value('price', price)
    loader.add_value('stock', stock)
    loader.add_xpath(
        'brand',
        '//div[@class="product-essential"]//a[@class="man_img"]/@title')
    loader.add_value('identifier', identifier)
    loader.add_value('sku', sku)
    loader.add_xpath('image_url', '//meta[@property="og:image"]/@content')
    loader.add_value('shipping_cost', 0)
    loader.add_value('url', response.url)
    loader.add_value('category', category)
    main_product = loader.load_item()

    # Map each variant id to " - label1 - label2 ..." built from every
    # option that lists the id.
    variants = {}
    if response.xpath('//div[@id="product-options-wrapper"]//select'):
        product_config = re.findall(
            r'var spConfig = new Product.Config\((.*)?\);', response.body)
        if product_config:
            product_data = json.loads(product_config[0])
            for attr in product_data['attributes'].values():
                for option in attr['options']:
                    for opt_id in option['products']:
                        variants[opt_id] = ' - '.join(
                            (variants.get(opt_id, ''), option['label']))

    if not variants:
        # Previously nothing at all was yielded when select boxes were
        # present but no variant could be read from spConfig.
        yield main_product
        return

    for variant_id, option_name in variants.items():
        new_item = Product(main_product)
        new_item['identifier'] += '_' + variant_id
        new_item['name'] += option_name
        yield new_item
def parse_product(self, response):
    """Yield a ``Product`` for the page if its brand label reads 'Lego'.

    Pages of any other brand yield nothing.  The SKU is the text between
    the last '(' and the following ')' of the product name, when the name
    contains a ')'.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    brand = fix_spaces(
        hxs.select('//label[@class="jda-brand-name"]/text()').extract()[0])
    if brand.title() != 'Lego':
        return

    loader = ProductLoader(selector=hxs, item=Product())

    name = fix_spaces(
        hxs.select('//div[@id="right-side"]//h1/text()').extract()[0])
    loader.add_value('name', name)
    loader.add_value('url', response.url)

    prices = hxs.select(
        '//div[@id="price"]//dd[@class="ours"]/text()').extract()
    if prices:
        loader.add_value('price', extract_price2uk(prices[0]))

    images = hxs.select('//div[@id="bubble-wrapper"]//img/@src').extract()
    if images:
        loader.add_value('image_url', urljoin(base_url, images[0]))

    loader.add_value('category', 'Lego')
    loader.add_value('brand', 'Lego')

    # An image inside #product-out-of-stock marks the product as unavailable.
    sold_out = hxs.select('//*[@id="product-out-of-stock"]/a/img').extract()
    loader.add_value('stock', 0 if sold_out else 1)

    loader.add_xpath('identifier', '//input[@name="productId_0"]/@value')

    if ')' in name:
        loader.add_value('sku', name.split('(')[-1].split(')')[0])

    yield loader.load_item()
def parse_search(self, response):
    """Parse a hotel search result page.

    Yields one ``Product`` per result row that has a name, a URL and a
    parseable price, followed by the request returned by
    ``self.get_city_request()``.  Rows with a missing field are logged and
    skipped.

    :param response: the downloaded search result page.
    """
    hxs = HtmlXPathSelector(response)

    count_el = hxs.select(
        "//table[@id='ctl05_myContainer']/tr[3]/td[2]"
        "/div[@id='ctl05_ctl12']/h1/text()"
    ).extract()
    count = '0'
    for el in count_el:
        m = re.search(r"[\d]+", el)
        count = m.group(0) if m else '0'
    logging.error("Found %s hotels" % count)

    hotels = hxs.select(
        "//div[@id='divResults']/div[@class='accomodation grey'] | "
        "//div[@id='divResults']/div[@class='accomodation']"
    )
    for hotel in hotels:
        # Each missing field used to be logged and then indexed anyway,
        # raising IndexError; such rows are now skipped.
        name = hotel.select("div[1]/h4/a/text()").extract()
        if not name:
            logging.error("No name")
            continue
        name = name[0]

        url = hotel.select("a[1]/@href").extract()
        if not url:
            logging.error("No url %s" % name)
            continue
        url = url[0]

        price = hotel.select(
            "div[@class='price']/span[@class='sum2']/text()").extract()
        if not price:
            logging.error("No price")
            continue
        price = extract_price2uk(price[0])
        if price is None:
            logging.error("No price %s" % name)
            continue
        # NOTE(review): `nights` is not defined in this method; it is
        # presumably a module-level value - confirm.
        price = int(price) * nights

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', name.encode('ascii', 'replace'))
        loader.add_value('identifier', name.encode('ascii', 'replace'))
        loader.add_value('url', url)
        loader.add_value('price', price)
        yield loader.load_item()

    yield self.get_city_request()
def variant_sum(self, response):
    """Walk one level of the variation menu, or emit the final variant.

    The JSON body either lists further choices under ``variables`` - then
    one follow-up request per choice is yielded, handled by this same
    method - or it describes a fully selected variant, which is yielded as
    a ``Product`` with its own price, name and identifier.
    """
    data = json.loads(response.body)
    choices = data.get('variables')

    if not choices:
        # Leaf of the menu: every option has been selected.
        variant = Product(response.meta['product'])
        variant['price'] = extract_price2uk(data['price'])
        full_name = ' '.join(
            (response.meta['product_name'], response.meta['option_name']))
        variant['name'] = fix_spaces(full_name)
        variant['identifier'] = response.meta['product_id'] + '-' + data['id']
        yield variant
        return

    option_type_id = url_query_parameter(response.url, 'optionId')
    for choice in choices:
        url = add_or_replace_parameter(
            response.url, 'variableIds[%s]' % option_type_id, choice)

        meta = response.meta.copy()
        meta['product'] = Product(response.meta['product'])

        pending = response.meta.get('next_options')[:]
        if pending:
            # Ask for the menu of the next option type.
            next_type, remaining = pending[0], pending[1:]
            url = add_or_replace_parameter(url, 'optionId', next_type)
            meta['next_options'] = remaining
        else:
            # No option types left: switch from the menu URL to the
            # data URL and drop the optionId parameter.
            url = url.replace('sMenu', '')
            url = url_query_cleaner(url, ('optionId',), remove=True)

        meta['option_name'] = (
            response.meta['option_name'] + ' ' + choices[choice])
        yield Request(url, self.variant_sum, meta=meta)
def parse_product(response):
    """Parse a product page and yield one ``Product`` per option combination.

    Option groups are read from the ``perms['...`` ... ``];`` sections of
    the raw page body.  A base product is built with a ``ProductLoader``;
    for every ``(price, name)`` pair returned by ``multiply(opt_groups)`` a
    copy is yielded whose name, price and identifier include the option.

    NOTE(review): there is no ``self`` parameter - presumably a module-level
    callback or a staticmethod; confirm against the caller.
    """
    hxs = HtmlXPathSelector(response)

    # Collect the lines between "perms['" and "];" and turn each such
    # section into a list of [price, name] options.
    opt_groups = []
    inside = False
    lst = ''
    for line in response.body.split('\n'):
        if line.startswith('perms[\''):
            inside = True
            lst = ''
        elif line.startswith('];'):
            if lst:
                # SECURITY NOTE(review): eval() executes text taken from the
                # downloaded page; consider a literal-only parser instead.
                opts = eval('[' + lst + ']')
                # XXX http://www.thesleepshop.co.uk/acatalog/4ft6_Double_Kyoto_Memphis_Futon.html#a11717
                # second option has "Deluxe Mattress" twice with different additional price
                # however price calculation ignores second addition price (uses first value)
                filtered_opts = []
                for price, name in opts:
                    # Keep only the first occurrence of each option name.
                    if not [name for pn in filtered_opts if pn[1] == name]:
                        filtered_opts.append([price, name])
                opt_groups.append(filtered_opts)
            inside = False
        elif inside:
            lst += line

    # The identifier is the part after "Q_" in a form input name;
    # raises IndexError when no such input exists.
    identifier = hxs.select(
        '//form//input[contains(@name, "Q_")]/@name').re(r'Q_(.*)$')[0]

    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_value('url', response.url)
    product_loader.add_xpath('name', u'//h3[@class="product"]/text()')
    product_loader.add_xpath('name', u'//span[@class="product"]/text()')
    product_loader.add_value('sku', identifier)
    product_loader.add_value('identifier', identifier)
    product_loader.add_value('category', response.meta.get('category'))

    # The discounted price is added first, the regular price second.
    # NOTE(review): presumably the loader keeps the first non-empty value -
    # confirm against ProductLoader.
    product_loader.add_css('price', '.discprice::text')
    price_reg = response.xpath(
        '//div[@id="price_inside"]//span//text()').extract_first(
        ) or response.xpath(
            '//div[@id="price_inside"]//span/@ppraw').extract_first()
    price_reg = extract_price2uk(price_reg)
    product_loader.add_value('price', price_reg)
    product_loader.add_value('price', '')
    # Ratio of the loaded price to the regular price; applied to the option
    # surcharges below.  Fails if price_reg is zero.
    discount = product_loader.get_output_value('price') / price_reg

    img = hxs.select(
        u'//div[@class="slides_control"]/a/img/@src').extract()
    if not img:
        img = hxs.select(
            u'//div[@class="image_product"]//img/@src').extract()
    # Raises IndexError when neither image location matches.
    product_loader.add_value('image_url',
                             urljoin_rfc(get_base_url(response), img[0]))

    brand_logo = hxs.select(
        u'//h3[@class="product"]/../img/@src').extract()
    if not brand_logo:
        brand_logo = hxs.select(
            u'//h3[@class="product"]/img/@src').extract()
    # Logo file name -> brand name.
    # NOTE(review): 'fainsborough' and 'healt beds' look like typos; they are
    # emitted values, so they are left untouched here.
    brands = {
        '6thsense.jpg': '6th sense',
        'bentley.gif': 'bentley',
        'birlea.gif': 'birlea',
        'blank.gif': '',
        'brand': '',
        'Breasley.gif': 'breasley',
        'buoyant.jpg': 'buoyant',
        'cro.gif': 'cro',
        'cumfilux.gif': 'cumfilux',
        'dt.gif': 'dt',
        'dunlopillo.gif': 'dunlopillo',
        'durabeds.gif': 'durabeds',
        'easycomfort.gif': 'easy comfort',
        'friendship_mill.gif': 'friendship mill',
        'Furmanac.gif': 'furmanac',
        'gainsborough.gif': 'fainsborough',
        'gleneagle.gif': 'gleneagle',
        'harlequin.gif': 'harlequin',
        'harmony.gif': 'harmony',
        'healthbeds.gif': 'healt beds',
        'highgate.gif': 'highgate',
        'hypnos.gif': 'hypnos',
        'jay-be.gif': 'jay be',
        'julianbowenlogo.jpg': 'julian bowen',
        'kaymed.gif': 'kaymed',
        'komfi.gif': 'komfi',
        'kyoto.gif': 'kyoto',
        'limelight.gif': 'limelight',
        'metalbeds.gif': 'metalbeds',
        'millbrook.gif': 'millbrook',
        'myers.gif': 'myers',
        'nd.gif': 'newdesign',
        'nestledown.gif': 'nestledown',
        'obc.gif': 'original bedstead',
        'Protectabed.gif': 'protectabed',
        'rauch.gif': 'rauch',
        'relaxsan.gif': 'relaxsan',
        'relyon.gif': 'relyon',
        'rest_assured.gif': 'rest assured',
        'richman.gif': 'richman',
        'sealy.gif': 'sealy',
        'shakespeare.gif': 'shakespeare',
        'silentnight.gif': 'silentnight',
        'sleepeezee.gif': 'sleepeezee',
        'sleepshaper.gif': 'sleepshaper',
        'sleepyvalley.gif': 'sleepyvalley',
        'slumberland.gif': 'slumberland',
        'staples.gif': 'staples',
        'steens.gif': 'steens',
        'swanglen.gif': 'swanglen',
        'sweetdreams.gif': 'sweetdreams',
        'tss.gif': 'the sleep shop',
        'verona.jpg': 'verona',
        'welcome.gif': 'welcome furniture',
    }
    # Unknown logos fall back to remove_extension() of the logo value.
    # Raises IndexError when no logo was found at all.
    product_loader.add_value(
        'brand', brands.get(brand_logo[0], remove_extension(brand_logo[0])))
    product = product_loader.load_item()

    # NOTE(review): multiply() is defined elsewhere; it presumably combines
    # the option groups into (price, name) pairs - confirm.
    for opt_price, opt_name in multiply(opt_groups):
        prod = Product(product)
        prod['name'] = (prod['name'] + ' ' + opt_name).strip()
        try:
            # Base price plus the discounted option surcharge, 2 decimals.
            prod['price'] = (Decimal(prod['price']) +
                             Decimal(opt_price) * discount).quantize(
                                 Decimal('1.00'))
        except TypeError:
            prod['price'] = Decimal(0)
        prod['identifier'] = prod['identifier'] + ':' + opt_name
        yield prod
bushnell_product = self.bushnell_products.get( loader.get_output_value('sku').upper().strip(), None) if bushnell_product: category = bushnell_product['Class'] self.log('Extracts category "%s" from bushnell file, URL: %s' % (category, loader.get_output_value('url'))) if category: if isinstance(category, list): for cat in category: loader.add_value('category', cat) else: loader.add_value('category', category) else: loader.add_value('category', '') if item.get('shipping_cost', None): loader.add_value( 'shipping_cost', extract_price2uk(item['shipping_cost']) if not isinstance(item['shipping_cost'], Decimal) else item['shipping_cost']) for synonym_field, field in synonym_fields.items(): if synonym_field in item: value = item[synonym_field] loader.add_value(field, value) product = loader.load_item() return product
def parse(self, response):
    """Follow category links and yield the products listed on the page.

    Two listing layouts are handled: the ``productsBox`` grid and the
    ``cartTbl`` table.  Entries without a name, URL or price are logged and
    skipped.

    :param response: the downloaded category or listing page.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    categories = hxs.select(
        "//td[@class='columnLeft']/div[@id='nav']//a/@href").extract()
    for category in categories:
        yield Request(urljoin_rfc(base_url, category), callback=self.parse)

    # Layout 1: product grid.
    items_table = hxs.select(
        "//table[@class='productsBox']/tr/td[@class='newProducts']")
    for item in items_table:
        name = item.select("h2/a/text()").extract()
        if not name:
            logging.error("%s - ERROR! NO NAME!" % response.url)
            continue
        name = name[0]

        url = item.select("h2/a/@href").extract()
        if not url:
            logging.error("%s - ERROR! NO URL!" % response.url)
            continue
        url = urljoin_rfc(base_url, url[0])

        price = item.select(".//div[@class='price']/text()").extract()
        if not price:
            logging.error("%s - %s - ERROR! NO PRICE!" % (response.url, name))
            continue
        price = price[-1]

        loader = ProductLoader(item=Product(), response=response)
        # The name doubles as identifier.  It is passed through unchanged:
        # str(name) raises UnicodeEncodeError on Python 2 for non-ASCII names.
        loader.add_value("identifier", name)
        loader.add_value("name", name)
        loader.add_value("url", url)
        loader.add_value("price", price)
        yield loader.load_item()

    # Layout 2: product table.
    items_list = hxs.select("//table[@class='cartTbl']/tr")
    for item in items_list:
        name = item.select("td[2]/a/text()").extract()
        if not name:
            logging.error("%s - ERROR! NO NAME!" % response.url)
            continue
        name = name[0]

        url = item.select("td[2]/a/@href").extract()
        if not url:
            logging.error("%s - ERROR! NO URL!" % response.url)
            continue
        url = urljoin_rfc(base_url, url[0])

        price = item.select("td[2]/text()").extract()
        if not price:
            logging.error("%s - %s - ERROR! NO PRICE!" % (response.url, name))
            continue
        # The cell holds several text nodes; the joined text is only
        # accepted when a price can be extracted from it.
        price = " ".join(price)
        if not extract_price2uk(price):
            logging.error("%s - %s - ERROR! NO PRICE!" % (response.url, name))
            continue

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value("identifier", name)
        loader.add_value("name", name)
        loader.add_value("url", url)
        loader.add_value("price", price)
        yield loader.load_item()
def parse_products(self, response):
    """Parse a product listing page and follow its pagination.

    For each listed product either a ``Product`` item is yielded (when the
    product id is present in the add-to-cart link) or a request to the
    product page is yielded, carrying the partially loaded item in
    ``meta['product']`` for ``parse_identifier``.  Entries that cannot be
    extracted are logged and skipped.

    :param response: the downloaded listing page.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    # The navigation label is page-wide, so it is read once, not once per
    # product.  It is stored in the 'brand' field below.
    navigation = hxs.select(
        '//span[@class="navigation_page"]/text()').extract()
    category = navigation.pop().strip() if navigation else None

    for product in hxs.select('//ul[@id="product_list"]/li'):
        try:
            identifier = product.select(
                './/a[contains(@class, "ajax_add_to_cart_button")]/@href'
            ).re(r'^.*&id_product=(\d+)&token')
            name = product.select('.//h3/a/@title').extract().pop().strip()
            url = urljoin_rfc(
                base_url,
                product.select('.//h3/a/@href').extract().pop().strip())
            price = extract_price2uk(
                product.select(
                    './/div[@class="content_price"]'
                    '/*[@class="price"]/text()').extract().pop().strip())
            image = urljoin_rfc(
                base_url,
                product.select('.//a[@class="product_img_link"]'
                               '/img/@src').extract().pop().strip())
        except Exception as exc:
            # Best effort per entry (usually IndexError from pop() on an
            # empty result).  The entry is skipped, but no longer silently.
            self.log('Skipping product on %s: %r' % (response.url, exc))
            continue

        loader = ProductLoader(response=response, item=Product())
        loader.add_value('name', name)
        if identifier:
            loader.add_value('identifier', identifier)
        loader.add_value('url', url)
        loader.add_value('brand', category)
        loader.add_value('price', price)
        loader.add_value('image_url', image)

        if identifier:
            yield loader.load_item()
        else:
            # The identifier has to be read from the product page itself.
            yield Request(url,
                          meta={'product': loader.load_item()},
                          callback=self.parse_identifier)

    next_page = hxs.select(
        '//div[@id="pagination"]'
        '//li[@id="pagination_next"]/a/@href').extract()
    if next_page:
        yield Request(urljoin_rfc(base_url, next_page.pop().strip()),
                      callback=self.parse_products)
def parse_product(self, response):
    """Parse a product page and yield the product or one item per option set.

    Without option select boxes the base product is yielded.  Otherwise
    every combination of option values (one value per box) yields its own
    item: the identifier is extended with the sorted option ids, the price
    with the sum of the option surcharges, and the name with the option
    names.

    :param response: the downloaded product page.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    name = hxs.select('//h1/text()').extract()
    if not name:
        return
    name = name[0]

    identifier = hxs.select(
        '//input[@name="product_id"]/@value').extract()[0]

    price = hxs.select(
        '//div[@class="price"]/div[@id="myoc-lpu"]/text()').extract()
    if price:
        price = extract_price2uk(price[0])
        stock = 1
    else:
        price = Decimal(0)
        stock = 0

    loader = ProductLoader(selector=hxs, item=Product())
    loader.add_value('name', name)
    loader.add_value('price', price)
    loader.add_value('stock', stock)
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)
    loader.add_xpath('image_url', '//a[@class="thumbnail"]/img/@src')
    loader.add_value('url', response.url)
    loader.add_value('shipping_cost', 0)
    # Every breadcrumb link except the last one is a category.
    for category in hxs.select(
            '//ul[@class="breadcrumb"]/li/a/text()')[:-1].extract():
        loader.add_value('category', category)
    loader.add_xpath('brand', '//li[contains(text(), "Brand")]/a/text()')
    product = loader.load_item()

    # Select boxes about VAT or delivery are not product options.
    option_boxes = hxs.select(
        '//select[@class="form-control" and contains(@id, "option")'
        ' and not(contains(./option/., "V.A.T."))'
        ' and not(contains(./option/., "VAT"))'
        ' and not(contains(./option/., "Delivery"))]')
    if not option_boxes:
        yield product
        return

    options_dict = dict()
    options = []
    for option_box in option_boxes:
        option_group = []
        for option in option_box.select(
                './option[@value!="" and not(contains(.,"VAT Exempt"))]'):
            option_id = option.select('./@value')[0].extract()
            option_name = option.select('./text()')[0].extract()
            # A surcharge is written as "(+£12.34)" inside the option text.
            option_price = re.search(u'\(\+\xa3(.*)\)', option_name)
            option_price = Decimal(
                option_price.group(1)) if option_price else Decimal('0.00')
            option_name = re.sub('VAT Payable ?-? ?', '', option_name)
            option_name = re.sub(u'\(\+\xa3(.*)\)', '', option_name).strip()
            options_dict[option_id] = {
                'name': option_name,
                'price': option_price
            }
            option_group.append(option_id)
        options.append(option_group)

    for option in itertools.product(*options):
        option_name = ' '.join(
            [options_dict[option_id]['name'] for option_id in option])
        option_price = sum(
            [options_dict[option_id]['price'] for option_id in option])
        option_identifier = '-'.join(sorted(option))

        # Each combination gets its own item.  Re-yielding the one shared
        # item would let later combinations overwrite earlier ones that
        # have not been processed yet.
        variant = Product(product)
        variant['identifier'] = '-'.join((identifier, option_identifier))
        variant['price'] = price + option_price
        variant['name'] = fix_spaces(' '.join((name, option_name)))
        yield variant
def parse(self, response):
    """Follow category links and yield the products listed on the page.

    Two listing layouts are handled: the ``productsBox`` grid and the
    ``cartTbl`` table.  Entries without a name, URL or price are logged and
    skipped.

    :param response: the downloaded category or listing page.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    categories = hxs.select(
        "//td[@class='columnLeft']/div[@id='nav']//a/@href").extract()
    for category in categories:
        yield Request(urljoin_rfc(base_url, category), callback=self.parse)

    # Layout 1: product grid.
    items_table = hxs.select(
        "//table[@class='productsBox']/tr/td[@class='newProducts']")
    for item in items_table:
        name = item.select("h2/a/text()").extract()
        if not name:
            logging.error("%s - ERROR! NO NAME!" % response.url)
            continue
        name = name[0]

        url = item.select("h2/a/@href").extract()
        if not url:
            logging.error("%s - ERROR! NO URL!" % response.url)
            continue
        url = urljoin_rfc(base_url, url[0])

        price = item.select(".//div[@class='price']/text()").extract()
        if not price:
            logging.error("%s - %s - ERROR! NO PRICE!" % (response.url, name))
            continue
        price = price[-1]

        loader = ProductLoader(item=Product(), response=response)
        # The name doubles as identifier.  It is passed through unchanged:
        # str(name) raises UnicodeEncodeError on Python 2 for non-ASCII names.
        loader.add_value('identifier', name)
        loader.add_value('name', name)
        loader.add_value('url', url)
        loader.add_value('price', price)
        yield loader.load_item()

    # Layout 2: product table.
    items_list = hxs.select("//table[@class='cartTbl']/tr")
    for item in items_list:
        name = item.select("td[2]/a/text()").extract()
        if not name:
            logging.error("%s - ERROR! NO NAME!" % response.url)
            continue
        name = name[0]

        url = item.select("td[2]/a/@href").extract()
        if not url:
            logging.error("%s - ERROR! NO URL!" % response.url)
            continue
        url = urljoin_rfc(base_url, url[0])

        price = item.select("td[2]/text()").extract()
        if not price:
            logging.error("%s - %s - ERROR! NO PRICE!" % (response.url, name))
            continue
        # The cell holds several text nodes; the joined text is only
        # accepted when a price can be extracted from it.
        price = " ".join(price)
        if not extract_price2uk(price):
            logging.error("%s - %s - ERROR! NO PRICE!" % (response.url, name))
            continue

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('identifier', name)
        loader.add_value('name', name)
        loader.add_value('url', url)
        loader.add_value('price', price)
        yield loader.load_item()
def load_item(self, item, name, identifier, price, response):
    """Build and return a ``ProductLoader`` for one listing page.

    :param item: selector for the listing page (must expose ``response``).
    :param name: product name to load.
    :param identifier: product identifier to load.
    :param price: price to load; when ``None`` it is read from the page
        with ``self._get_item_price``.
    :param response: response whose ``meta['item_meta']`` supplies the brand
        and the values of ``self._match_fields``.
    :returns: the populated loader (``load_item()`` is left to the caller).
    """
    breadcrumb = item.select(
        '//td[@id="vi-VR-brumb-lnkLst"]//span[@itemprop="name"]/text()'
    ).extract()
    # The last breadcrumb entry is the category.
    category = breadcrumb.pop() if breadcrumb else ''

    seller_id = ''.join(
        item.select('.//*[contains(@class, "si-content")]'
                    '//a/*[@class="mbg-nw"]/text()').extract())

    # Brand: the search metadata wins; otherwise the item specifics are
    # tried label by label until one yields a non-blank text.
    brand_xpaths = (
        '//*[@class="attrLabels" and contains(text(), "Brand")]'
        '/following-sibling::*[1]//text()',
        '//*[@class="attrLabels" and contains(text(), "Brand")]'
        '/following-sibling::*[1]/h2/text()',
        '//*[@class="attrLabels" and contains(text(), "Brand")]'
        '/following-sibling::*[1]/h3/text()',
        '//*[@class="attrLabels" and contains(text(), "Marke")]'
        '/following-sibling::*[1]//text()',
        '//*[@class="attrLabels" and contains(text(), "Hersteller")]'
        '/following-sibling::*[1]//text()',
        '//*[@class="attrLabels" and contains(text(), "Marque")]'
        '/following-sibling::*[1]//text()',
    )
    brand = response.meta['item_meta'].get('brand')
    if not brand:
        for xpath in brand_xpaths:
            brand = [text for text in item.select(xpath).extract()
                     if text.strip() != '']
            if brand:
                break

    product_loader = ProductLoader(item=Product(), selector=item)
    for field in self._match_fields:
        product_loader.add_value(
            field, response.meta['item_meta'].get(field, None))
    product_loader.add_value('name', name)
    product_loader.add_value('category', category)
    product_loader.add_value('dealer', 'eBay - ' + seller_id)
    product_loader.add_value('identifier', identifier)
    if brand:
        # A brand scraped from the page is a list of text nodes.
        if isinstance(brand, list):
            product_loader.add_value('brand', brand[0])
        else:
            product_loader.add_value('brand', brand)
    product_loader.add_xpath('image_url', '//img[@id="icImg"]/@src')
    product_loader.add_value('url', item.response.url)

    price = price if price is not None else self._get_item_price(item)
    product_loader.add_value('price', price)

    # stock amount
    if self._extract_stock_amount:
        in_stock = ''.join(
            item.select('//*[@id="qtySubTxt"]//text()').extract())
        digit_runs = re.findall(r"[\d]+", in_stock)
        if 'More than' in in_stock:
            # "More than N available" is recorded as 11.
            stock = 11
        elif digit_runs:
            # The longest run of digits is taken as the quantity.
            stock = max(digit_runs, key=len)
        else:
            stock = ''
        if stock:
            product_loader.add_value('stock', stock)

    # shipping cost
    shipping = item.select(
        '//*[@id="shippingSection"]//td/div/text()').extract()
    if shipping and shipping[0]:
        if 'free' in shipping[0].lower():
            product_loader.add_value('shipping_cost', 0)
        else:
            product_loader.add_value('shipping_cost',
                                     extract_price2uk(shipping[0]))

    return product_loader
def parse_product(self, response):
    """Parse a product page; yield the product or its variation requests.

    A page without variation select boxes yields the product itself.
    Otherwise one GET request per value of the first select box is yielded.
    Each request carries the selected variable ids as form data and is
    handled by ``variant_sum``.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    loader = ProductLoader(selector=hxs, item=Product())

    # The brand is the alt text inside a data-img-attributes value.
    brand = hxs.select(
        '//section[contains(@class, "product-variations")]'
        '/div/div[2]/span/@data-img-attributes').extract()
    if not brand:
        brand = hxs.select(
            '//u[./a/@href="#product-details"]'
            '/preceding-sibling::div[last()]//@data-img-attributes').extract()
    if brand:
        brand = re.findall(r'alt="(.*)"', brand[0])[0]

    sku_text = hxs.select(
        '//h1[contains(@class, "product-title")]'
        '/following-sibling::p/text()').extract()
    sku = re.findall(r'\#(.*)', sku_text[0])

    image_url = hxs.select('//a[@class="thumbnail"]/@href').extract()
    if image_url:
        image_url = urljoin(base_url, image_url[0])

    price = hxs.select('//span[@class="price-price"]/text()').extract()
    if price:
        price, stock = extract_price2uk(price[0]), 1
    else:
        price, stock = 0, 0

    product_id = response.xpath(
        '//input[@name="product_id"]/@value').extract_first()
    name = response.xpath(
        '//h1[contains(@class, "product-title")]/text()').extract()[0]

    loader.add_value('name', name)
    loader.add_value('price', price)
    loader.add_value('brand', brand)
    loader.add_value('sku', sku)
    loader.add_value('image_url', image_url)
    loader.add_value('url', response.url)
    loader.add_value('identifier', product_id)
    loader.add_value('shipping_cost', 0)
    loader.add_value('stock', stock)
    product = loader.load_item()

    selects = response.css('.product-variations select')
    if not selects:
        yield product
        return

    selected_ids = json.loads(
        response.xpath('//@data-selected-variable-ids').extract_first())

    # The first select box is expanded here; the remaining ones are walked
    # by the follow-up requests.
    options = selects.pop(0).xpath('option[@value!=""]')

    get_sum_url = 'http://www.bigbrandbeds.co.uk/admin/controller/ProductVariations/getVariationData?productId=%s'
    get_menu_url = 'http://www.bigbrandbeds.co.uk/admin/controller/ProductVariations/getVariationsMenuData?productId=%s&optionId=%s'

    # Start from the variables that are pre-selected on the page.
    form = dict()
    if selected_ids:
        for var_id in selected_ids:
            if selected_ids[var_id]:
                form['variableIds[%s]' % var_id] = str(selected_ids[var_id])

    if selects:
        next_menu = selects.pop(0).xpath(
            '@data-variations-menu').extract_first()
        url = get_menu_url % (product_id, next_menu)
    else:
        url = get_sum_url % product_id

    for option in options:
        option_type_id = option.xpath(
            '../@data-variations-menu').extract_first()
        if option_type_id:
            form['variableIds[%s]' % option_type_id] = option.xpath(
                '@value').extract_first()

        if selects:
            next_options = selects.xpath(
                '@data-variations-menu').extract()[:]
        else:
            next_options = []

        yield FormRequest(
            url,
            formdata=form,
            method="GET",
            callback=self.variant_sum,
            dont_filter=True,
            meta={
                'product': Product(product),
                'option_name': option.xpath('text()').extract_first(),
                'product_name': name,
                'product_id': product_id,
                'next_options': next_options,
            })