def parse_options(self, response):
    """Yield one product item per sub-product found in the response body.

    The base product is taken from ``response.meta['product']`` (a fresh
    ``Product`` is used when the key is missing).  Sub-product ids, names
    and prices are scraped from the ``sub_product.* = ...;`` assignments
    in the raw response body.

    When both ids and names are found, a deep copy of the base product is
    yielded for every (id, name, price) triple, with its name, identifier,
    price and stock set from the sub-product.  Otherwise the base product
    itself is yielded once, with its stock flag derived from its price.
    """
    product = response.meta.get('product', Product())
    body = response.body

    sub_ids = re.findall('sub_product.product_id = ([0-9]+);', body)
    sub_names = re.findall('sub_product.product_name = "(.*)";', body)
    sub_prices = re.findall('sub_product.price = "(.*)";', body)

    if sub_ids and sub_names:
        # zip() stops at the shortest list, so a sub-product without a
        # matching price entry is silently dropped.
        for sub_id, sub_name, raw_price in zip(sub_ids, sub_names, sub_prices):
            option = deepcopy(product)
            loader = ProductLoader(item=option, selector="")
            loader.replace_value('name', sub_name.decode("unicode_escape"))
            loader.replace_value(
                'identifier', "%s-%s" % (option['identifier'], sub_id))

            # Drop ':' and '-' from the scraped price text.
            price = raw_price.decode("unicode_escape")
            price = price.replace(":", "").replace("-", "")

            # An empty or literal '0' price is treated as out of stock.
            stock = 1 if price and price != '0' else 0
            loader.replace_value('price', price)
            loader.add_value('stock', stock)
            yield loader.load_item()
    else:
        loader = ProductLoader(item=product, selector="")
        # loader.get_value('price') would only process the literal string
        # 'price' (always truthy), so read the price from the item itself.
        # NOTE(review): assumes Product supports dict-style .get() -- confirm.
        stock = 1 if product.get('price') else 0
        loader.replace_value('stock', stock)
        yield loader.load_item()
def parse_product(self, response):
    """Yield product items for a product page, then follow related products.

    Responses that are not ``HtmlResponse`` instances are ignored.

    One item is yielded per entry of the attribute ``<select>`` on the
    page; when the page has no such options a single item with the bare
    product name is yielded.  Each option text is split on parentheses:
    the part before them is the option name and the parenthesised part,
    when present, is parsed as a signed price difference that is added to
    the page price.

    Afterwards a request (handled by this same method) is yielded for each
    link in the first ``product_section_sub`` div, except the link that
    equals the current page's own ``/product-pol...`` path.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    # Raw string: '\.' in a plain literal is an invalid escape sequence.
    id_regex = r'p-([0-9]+)\.html'

    brand = hxs.select(
        "//*[contains(text(),'Dise') and contains(text(),'ador:')]/text()"
    ).extract()
    brand = brand[0].split(':')[1].strip() if brand else ''

    # Build (extra_name, price_diff) pairs, one per selectable option.
    option_texts = hxs.select(
        u'//select[@class="form" and contains(@onchange, "actualiza_atributos")]/option/text()'
    ).extract()
    option_specs = []
    for option_text in option_texts:
        parts = re.split(r'[()]', option_text, 2)
        if len(parts) == 1:
            # No parentheses in the option text: no price adjustment.
            price_diff = 0
        else:
            price_spec = parts[1]
            modifier = -1 if price_spec.startswith('-') else 1
            amount = price_spec.replace('+', '').replace('-', '')
            price_diff = Decimal(spanishDecimal(amount)) * modifier
        option_specs.append((parts[0], price_diff))
    if not option_specs:
        # No options on the page: emit the plain product once.
        option_specs.append(('', 0))

    # Page-level values are the same for every option, so query them once.
    category = hxs.select(
        u'//td[@class="cont_heading_td"]/span[@class="sub_cont_heading_td"]/text()'
    ).extract()
    category = category[0] if category else ''

    image_url = hxs.select(
        u'(//a[@rel="fotografias"])[1]/@href').extract()
    if image_url:
        image_url = urljoin_rfc(base_url, image_url[0])

    # Raises IndexError when the page has no name or price cell.
    name = hxs.select(
        u'//td[@class="cont_heading_td"]/h1[last()]/text()'
    ).extract()[0].strip()
    base_price = hxs.select('//td[@class="preu"]/text()[1]').extract()[0]
    base_price = Decimal(spanishDecimal(base_price))

    for extra_name, price_diff in option_specs:
        product_loader = ProductLoader(item=Product(), response=response)
        if extra_name:
            product_loader.add_value(
                'name', "%s - %s" % (name, extra_name.strip()))
        else:
            product_loader.add_value('name', name)
        product_loader.add_value('url', response.url,
                                 Compose(stripSessionId))
        product_loader.add_value('category', category)
        product_loader.add_value('brand', brand)
        product_loader.add_value('image_url', image_url,
                                 Compose(stripSessionId))
        if extra_name:
            # NOTE: the identifier uses the option text exactly as
            # extracted (not stripped), unlike the name above.
            identifier = product_loader.get_value(
                response.url, TakeFirst(), re=id_regex)
            product_loader.add_value(
                'identifier', "%s-%s" % (identifier, extra_name))
        else:
            product_loader.add_value('identifier', response.url,
                                     TakeFirst(), re=id_regex)
        product_loader.add_xpath('sku',
                                 '//td[contains(text(), "Ref:")]/text()',
                                 TakeFirst(), re='Ref: (.+)')
        price = base_price + price_diff if price_diff else base_price
        product_loader.add_value('price', price)
        product_loader.add_value('stock', 1)
        yield product_loader.load_item()

    # Follow the related products, skipping the link to the current page.
    more_products = hxs.select(
        u'//div[@class="product_section_sub"][1]/a[@title]/@href'
    ).extract()
    _, _, urlpath = response.url.partition('/product-pol')
    url_to_remove = "/product-pol%s" % urlpath
    for product_url in set(more_products) - set([url_to_remove]):
        yield Request(urljoin_rfc(base_url, product_url),
                      callback=self.parse_product)