def parse_products(self, response):
    """Parse a product listing page.

    For every product found under ``div#productListing`` this yields a
    ``Request`` for the product URL, handled by ``self.parse_product``,
    with a loader pre-filled from the listing passed in
    ``meta['loader']``.  When a " Next Page " link is present, a
    ``Request`` for it is yielded last, handled by this same method.

    Listing entries without a link are skipped.  Entries without a
    recognisable price are still requested, just without a ``price``
    value in the loader.  Either case used to raise ``IndexError`` and
    abort the whole page, including pagination.
    """
    hxs = HtmlXPathSelector(response)
    products = hxs.select(
        '//div[@id="productListing"]'
        '//h5[a[contains(@class, "product-name")]]/..')

    for product in products:
        links = product.select('.//a/@href').extract()
        if not links:
            # Nothing to request for this entry; keep processing the rest.
            continue
        url = links[0]

        loader = ProductLoader(item=Product(), selector=product)
        loader.add_xpath('name', './/a/text()')
        loader.add_value('url', url)

        # Regular price first, then the "special price" markup as fallback.
        price = product.select(
            '..//div[@class="product-buttons"]'
            '//span[@class="sellPrice"]/text()').extract()
        if not price:
            price = product.select(
                '..//div[@class="product-buttons"]'
                '//div[@class="productSpecialPrice"]/span/text()').extract()
        if price:
            loader.add_value('price', price[0])

        yield Request(url, callback=self.parse_product,
                      meta={'loader': loader})

    next_page = hxs.select('//a[@title=" Next Page "]/@href').extract()
    if next_page:
        url = urljoin_rfc(get_base_url(response), next_page[0])
        yield Request(url, callback=self.parse_products)
def parse_product(self, response):
    """Parse a product detail page and yield a single loaded item.

    Nothing is yielded (a message is logged instead) when:

    * the response was reached through a redirect
      (``redirect_urls`` present in ``response.meta``);
    * the URL does not contain ``p-<digits>.html``, from which the
      identifier is taken;
    * the page has no product name heading.

    The last two cases previously raised ``AttributeError`` /
    ``IndexError`` instead of being skipped.
    """
    hxs = HtmlXPathSelector(response)

    redirected_urls = response.meta.get('redirect_urls', None)
    if redirected_urls:
        log.msg('Skips product, redirected url: ' + str(redirected_urls[0]))
        return

    id_match = re.search(r'p-(\d+)\.html', response.url)
    if id_match is None:
        log.msg('Skips product, no identifier in url: ' + response.url)
        return

    # Both heading cells share class/valign; the name cell has no @align,
    # the price cell is right-aligned.
    name_cell = u'//td[@class="pageHeading" and @valign="top" and not(@align)]'
    price_cell = u'//td[@class="pageHeading" and @valign="top" and @align="right"]'

    names = hxs.select(name_cell + u'/text()').extract()
    if not names:
        log.msg('Skips product, no name found: ' + response.url)
        return
    name = names[0]

    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)
    product_loader.add_value('identifier', id_match.group(1))
    product_loader.add_value('name', name)

    price = ''.join(hxs.select(price_cell + u'/text()').extract()).strip()
    if not price:
        # Fall back to the special-price markup inside the same cell.
        price = ''.join(
            hxs.select(
                price_cell + u'/span[@class="productSpecialPrice"]/text()'
            ).extract())
    product_loader.add_value('price', price)

    product_loader.add_xpath(
        'sku',
        name_cell + u'/span[@class="smallText"]/text()',
        re=r'\[(.*)\]')
    product_loader.add_value('category', response.meta.get('category'))

    image_url = hxs.select(
        u'//a[contains(@href,"images") and child::img]/@href').extract()
    if image_url:
        image_url = urljoin_rfc(get_base_url(response), image_url[0])
        product_loader.add_value('image_url', image_url)

    # Brand detection: match the manufacturer drop-down entries against
    # the product name.  Entries containing '..' appear to be truncated,
    # so their last word is dropped before matching.
    # NOTE(review): the nesting below was reconstructed from a
    # whitespace-mangled source; the loop stops only on an exact
    # (non-truncated) match -- confirm against the original file.
    name_lower = name.lower()
    brands = hxs.select(
        '//form[@name="manufacturers"]/select/option/text()').extract()
    for brand in brands:
        if '..' in brand:
            incomplete_brand = ' '.join(brand.split()[:-1])
            if incomplete_brand.lower() in name_lower:
                product_loader.add_value('brand', brand.replace('..', ''))
        else:
            if brand.lower() in name_lower:
                product_loader.add_value('brand', brand.replace('..', ''))
                break

    yield product_loader.load_item()
def parse_product(self, response):
    """Parse a product detail page; yield the item when its price is > 0.

    The identifier is looked up in three places, in order: the hidden
    ``products_id`` form input, the "tell a friend" link, and finally a
    regex over the raw response body.  When none of them yields a value
    the URL is logged and nothing is yielded.
    """
    selector = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=selector)

    loader.add_value('url', response.url)
    loader.add_xpath('name', u'//div[@itemprop="name"]/text()')

    price_texts = selector.select(
        u'//span[@itemprop="price"]/text()').extract()
    # Default to '0' when the page shows no price.
    loader.add_value('price', price_texts[0] if price_texts else '0')

    ids = selector.select(
        u'//form//input[@type="hidden" and @name="products_id"]/@value'
    ).extract()
    if not ids:
        ids = selector.select(
            '//div[@id="productTellFriendLink"]/a/@href').re(
                'products_id=(.*)')
    if not ids:
        ids = re.findall(r'products_id=(.*)" class', response.body)
    if not ids:
        log.msg('Product without identifier: ' + response.url)
        return
    loader.add_value('identifier', ids[0])

    sku_texts = selector.select(
        u'//span[@itemprop="identifier"]/text()').extract()
    if sku_texts:
        loader.add_value('sku', sku_texts[0])

    loader.add_xpath('category', u'//div[@id="navBreadCrumb"]/a[2]/text()')

    image_srcs = selector.select(
        u'//div[@id="productMainImage"]//img/@src').extract()
    if image_srcs:
        loader.add_value(
            'image_url', urljoin_rfc(get_base_url(response), image_srcs[0]))

    brand_texts = selector.select('//li[@itemprop="brand"]/text()').extract()
    if brand_texts:
        loader.add_value(
            'brand', brand_texts[0].replace('Manufactured by: ', ''))

    item = loader.load_item()
    if item['price'] > 0:
        yield item
# NOTE(review): the source line for this method ended with a dangling,
# unterminated triple-quote (""") that was not part of the method; it was
# dropped when the method was reformatted -- confirm nothing depended on it.
def parse_product(self, response):
    """Parse a product detail page and yield a single loaded item.

    Nothing is yielded when the page has no price, or when the
    ``productID`` meta tag does not contain a ``sku:<value>`` entry to
    use as identifier.  The latter previously raised ``IndexError``; it
    is now skipped the same way a missing price already was.

    The brand is derived from the loaded product name via
    ``self._get_brand_from_name``.
    """
    hxs = HtmlXPathSelector(response)

    price = hxs.select(
        u'//div[@class="price-box"]//span[@class="price"]/text()').extract()
    if not price:
        return

    identifier = hxs.select(
        '//meta[@itemprop="productID"]/@content').re('sku:(.*)')
    if not identifier:
        return

    product_loader = AxeMusicProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)
    product_loader.add_xpath(
        'name', u'//div[@class="product-name"]/h1/text()')
    product_loader.add_value('price', price[0].strip())
    product_loader.add_xpath(
        'sku', u'//div[@class="sku"]/span[@class="value"]/text()')
    product_loader.add_xpath(
        'category', u'//div[@class="breadcrumbs"]/ul/li[2]/a/span/text()')

    img = hxs.select('//img[@id="image-main"]/@src').extract()
    if img:
        img = urljoin_rfc(get_base_url(response), img[0])
        product_loader.add_value('image_url', img)

    product_loader.add_value('identifier', identifier[0])
    product_loader.add_value(
        'brand',
        self._get_brand_from_name(product_loader.get_output_value('name')))

    yield product_loader.load_item()
def parse_product(self, response):
    """Parse a product detail page, yielding the product or its options.

    A base item is loaded from the page.  When the page contains
    ``tr.Multi-Child_Background`` rows, one item per row is yielded
    instead of the base item: a copy of the base item with ``sku``,
    ``identifier``, ``name`` and ``price`` taken from the row.  Otherwise
    the base item itself is yielded.

    Items are only yielded when they have an identifier and pass the
    ``price > 0`` check.  Option rows for which no price text can be
    found are skipped; previously such a row raised ``IndexError`` and
    aborted the remaining rows.
    """
    hxs = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), selector=hxs)

    product_loader.add_value('url', response.url)
    product_loader.add_xpath('name', u'//span[@itemprop="name"]/text()')

    price = hxs.select(
        u'//form[@id="vCSS_mainform"]//span[@itemprop="price"]/text()'
    ).extract()
    price = price[0] if price else u'0'
    product_loader.add_value('price', price)

    # The product code serves as both sku and identifier.
    product_loader.add_xpath('sku', u'//span[@class="product_code"]/text()')
    product_loader.add_xpath(
        'identifier', u'//span[@class="product_code"]/text()')
    product_loader.add_xpath(
        'category',
        u'//td[@class="vCSS_breadcrumb_td"]//a[position()=2]/@title')
    product_loader.add_xpath(
        'image_url',
        u'concat("http://lamusic.ca",//img[@id="product_photo"]/@src)')
    product_loader.add_xpath(
        'brand', u'//meta[@itemprop="manufacturer"]/@content')

    if hxs.select(u'//img[@class="vCSS_img_icon_free_shipping"]'):
        product_loader.add_value('shipping_cost', '0')

    product = product_loader.load_item()

    options = hxs.select(u'//tr[@class="Multi-Child_Background"]')
    if not options:
        if product.get('identifier') and product.get('price') > 0:
            yield product
        return

    for opt in options:
        p = Product(product)

        code = opt.select(u'./td[1]/text()').extract()[0].strip()
        p['sku'] = code
        p['identifier'] = code
        p['name'] = opt.select(u'./td[2]/text()').extract()[0].strip()

        # Try the price locations from most to least specific.
        price = opt.select(
            u'./td[4]//span[@itemprop="price"]/text()').extract()
        if not price:
            price = opt.select(u'./td[4]//span/text()').extract()
        if not price:
            price = opt.select(
                u'./td[3]//span[contains(text(), "$")]/text()').extract()
        if not price:
            # No price anywhere in the row: skip it, keep the others.
            continue
        p['price'] = price[0].strip().replace('$', '').replace(',', '')

        # NOTE(review): p['price'] is assigned a string above, so the
        # `> 0` check below compares a str with an int unless Product
        # coerces the value -- confirm the check behaves as intended.
        if p.get('identifier') and p.get('price') > 0:
            yield p